Code to import SQL database into juypyter¶

In [1]:
import sqlite3
import pandas as pd

# Connecting to the SQLite database
conn = sqlite3.connect('/Users/devinpathiraja/Desktop/IFN704 /lol.db') 

# Function to process each chunk of data
def process_chunk(chunk):
    # Replacing empty strings with NaN
    chunk.replace('', pd.NA, inplace=True)
    
    # Removing rows with missing values in specified columns
    chunk = chunk.dropna(subset=['match_id', 'puuid', 'teamPosition', 'win', 'teamId'])
    
    # Remove rows where puuid is 'BOT'
    chunk = chunk[chunk['puuid'] != 'BOT']
    
    return chunk

# SQL query to extract features
query = """
SELECT puuid, match_id, teamPosition, win, teamId
FROM participant
"""

# Executing the query and loading the results into a pandas DataFrame, processing in chunks
chunk_size = 100000  
df_list = []

for chunk in pd.read_sql_query(query, conn, chunksize=chunk_size):
    processed_chunk = process_chunk(chunk)
    df_list.append(processed_chunk)

# Concatenating all processed chunks
df = pd.concat(df_list, ignore_index=True)

# Closing the database connection
conn.close()

Getting match data¶

In [9]:
def get_match_data(df, match_id):
    # Filter the DataFrame for the specific match_id
    match_data = df[df['match_id'] == match_id]
    
    # Sort the data by teamId to group players by team
    match_data = match_data.sort_values('teamId')
    
    print(f"Data for match {match_id}:")
    print(f"Number of players found: {len(match_data)}")
    
    for index, row in match_data.iterrows():
        print(f"Player {index + 1}:")
        print(f"  PUUID: {row['puuid']}")
        print(f"  Position: {row['teamPosition']}")
        print(f"  Team ID: {row['teamId']}")
        print(f"  Win: {row['win']}")
        print()

# Specifying the match_id to check
match_id_to_check = 'OC1_564876644'

#Printing match data
get_match_data(df, match_id_to_check)
Data for match OC1_564876644:
Number of players found: 10
Player 1675:
  PUUID: UoBtQQIjzVdYoQnCE3v7kauzvPBm_UFbuVYAxFEO7NrxLx5ERnaoigWwBaqvr0S7COnlXLch6-fhqw
  Position: TOP
  Team ID: 100
  Win: 0

Player 1676:
  PUUID: FH8BKVK4gxBd8yd3w658M_k02mdHq_8seh_8CLEbbxFxCrIChhdbU46NzHiP0mYq55dPfI4U5q8Ivg
  Position: JUNGLE
  Team ID: 100
  Win: 0

Player 1677:
  PUUID: n7Vx94O6wlXye4RI11L7SxbEXH-sPXGnQyKTtCr7uClPzNda5199rdKQ-cR34q2W-CAmHc8OZL2SFg
  Position: MIDDLE
  Team ID: 100
  Win: 0

Player 1678:
  PUUID: cXPc4OrBrqibrLdDhgo4WZyR_gTpu7Bs65RmiY6yupMSAn07i1xnP7DaerYZWrwwdLLTr-UAfC0alw
  Position: BOTTOM
  Team ID: 100
  Win: 0

Player 1679:
  PUUID: NPD9MTbIW0kgE2fFwkvvudSq0_FtvLnJbON0s6zc66luaThe8HCALsAxcb9-V7WWLT6A9bBPMnQbkg
  Position: UTILITY
  Team ID: 100
  Win: 0

Player 1680:
  PUUID: CBWHup3kaETfy6ziW2Ty3d8jMbcUmhya-UtDpBCHilp8NFOGCQKpaCULCPHkfDiQmX-30z8a7BX1mQ
  Position: TOP
  Team ID: 200
  Win: 1

Player 1681:
  PUUID: tPJAHi7KaWwVA3Jt4ubwuArADbx7WCZukgWRgbszdvhtQTMVr24_1FWSZY89H1jTOOtiGUvb2qD9dw
  Position: JUNGLE
  Team ID: 200
  Win: 1

Player 1682:
  PUUID: sduZVJ-zJSh9qJKqV_0wnQN0EBhNnvhFg9u0AtGqXUYFjnpcbt8BvyBHyfxo-hAt9xgS-Uivq1GtrQ
  Position: MIDDLE
  Team ID: 200
  Win: 1

Player 1683:
  PUUID: 1xLRZHol5hz1JEI18bX_7nTpEDX03vqJN3VtnncTudnGbEHLKXLvD536iUFVpoH7bcsb5mBGVW4Bcw
  Position: BOTTOM
  Team ID: 200
  Win: 1

Player 1684:
  PUUID: 3dgn7-Fwe4mYKEFjyzovEh1nteHrt7B3EZUT1RP-mZ5vRYUEkjRDlWlZT6hoD2CaqZHUrCAhsXwEsw
  Position: UTILITY
  Team ID: 200
  Win: 1

Increasing Display width¶

In [3]:
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Set width to unlimited
pd.set_option('display.max_colwidth', None)  # Show full content of each column
pd.set_option('display.expand_frame_repr', False) 

Creating match summary¶

In [10]:
from tqdm import tqdm
import math

def create_match_summary(df):
    # Creating a player number for each match
    print("Creating player numbers...")
    df['player_num'] = df.groupby('match_id').cumcount() + 1

    # Pivoting the dataframe
    print("Pivoting dataframe...")
    pivoted = df.pivot(index='match_id', columns='player_num', 
                       values=['puuid', 'teamPosition', 'teamId', 'win'])

    # Flatten column names
    print("Flattening column names...")
    pivoted.columns = [f'{col[0]}_{col[1]}' for col in pivoted.columns]

    # Reseting index to make match_id a column
    pivoted = pivoted.reset_index()

    # Ensure all columns are present (in case some matches have fewer than 10 players)
    print("Ensuring all columns are present...")
    total_iterations = 40  # 10 players * 4 columns
    with tqdm(total=total_iterations) as pbar:
        for i in range(1, 11):
            for col in ['puuid', 'teamPosition', 'teamId', 'win']:
                if f'{col}_{i}' not in pivoted.columns:
                    pivoted[f'{col}_{i}'] = pd.NA
                pbar.update(1)

    # Reorder columns
    print("Reordering columns...")
    cols = ['match_id'] + [f'{col}_{i}' for i in range(1, 11) for col in ['puuid', 'teamPosition', 'teamId', 'win']]
    pivoted = pivoted[cols]

    return pivoted

# Creating the new DataFrame
print("Starting to create match summary...")
df_match_summary = create_match_summary(df)

print("Match summary created. Displaying first few rows...")
print(df_match_summary.head())

print("\nDataFrame Info:")
print(df_match_summary.info())

# Checking a specific match
match_id_to_check = 'OC1_564876644'
print(f"\nData for match {match_id_to_check}:")
print(df_match_summary[df_match_summary['match_id'] == match_id_to_check].iloc[0])

print("\nProcess completed successfully.")
Starting to create match summary...
Creating player numbers...
Pivoting dataframe...
Flattening column names...
Ensuring all columns are present...
100%|████████████████████████████████████████| 40/40 [00:00<00:00, 61342.65it/s]
Reordering columns...
Match summary created. Displaying first few rows...
        match_id                                                                         puuid_1 teamPosition_1 teamId_1 win_1                                                                         puuid_2 teamPosition_2 teamId_2 win_2                                                                         puuid_3 teamPosition_3 teamId_3 win_3                                                                         puuid_4 teamPosition_4 teamId_4 win_4                                                                         puuid_5 teamPosition_5 teamId_5 win_5                                                                         puuid_6 teamPosition_6 teamId_6 win_6                                                                         puuid_7 teamPosition_7 teamId_7 win_7                                                                         puuid_8 teamPosition_8 teamId_8 win_8                                                                         puuid_9 teamPosition_9 teamId_9 win_9                                                                        puuid_10 teamPosition_10 teamId_10 win_10
0  OC1_564876644  UoBtQQIjzVdYoQnCE3v7kauzvPBm_UFbuVYAxFEO7NrxLx5ERnaoigWwBaqvr0S7COnlXLch6-fhqw            TOP      100     0  FH8BKVK4gxBd8yd3w658M_k02mdHq_8seh_8CLEbbxFxCrIChhdbU46NzHiP0mYq55dPfI4U5q8Ivg         JUNGLE      100     0  n7Vx94O6wlXye4RI11L7SxbEXH-sPXGnQyKTtCr7uClPzNda5199rdKQ-cR34q2W-CAmHc8OZL2SFg         MIDDLE      100     0  cXPc4OrBrqibrLdDhgo4WZyR_gTpu7Bs65RmiY6yupMSAn07i1xnP7DaerYZWrwwdLLTr-UAfC0alw         BOTTOM      100     0  NPD9MTbIW0kgE2fFwkvvudSq0_FtvLnJbON0s6zc66luaThe8HCALsAxcb9-V7WWLT6A9bBPMnQbkg        UTILITY      100     0  CBWHup3kaETfy6ziW2Ty3d8jMbcUmhya-UtDpBCHilp8NFOGCQKpaCULCPHkfDiQmX-30z8a7BX1mQ            TOP      200     1  tPJAHi7KaWwVA3Jt4ubwuArADbx7WCZukgWRgbszdvhtQTMVr24_1FWSZY89H1jTOOtiGUvb2qD9dw         JUNGLE      200     1  sduZVJ-zJSh9qJKqV_0wnQN0EBhNnvhFg9u0AtGqXUYFjnpcbt8BvyBHyfxo-hAt9xgS-Uivq1GtrQ         MIDDLE      200     1  1xLRZHol5hz1JEI18bX_7nTpEDX03vqJN3VtnncTudnGbEHLKXLvD536iUFVpoH7bcsb5mBGVW4Bcw         BOTTOM      200     1  3dgn7-Fwe4mYKEFjyzovEh1nteHrt7B3EZUT1RP-mZ5vRYUEkjRDlWlZT6hoD2CaqZHUrCAhsXwEsw         UTILITY       200      1
1  OC1_564882293  slIf7LP4Kr_1_CpGyOmL_BkzGSgGLNhAos6mRON5nOxvab8d1P7o6JPuqPeYQB5NBVeiNeGJzPCjyw            TOP      100     0  jxud22N5_fkeJ0jmp9krUfNgcSkYXWe9nDtFk1hO4uTRndQe2KjVI4Ey4kO-_p2mvfKsckpQYQRi2A         JUNGLE      100     0  tPJAHi7KaWwVA3Jt4ubwuArADbx7WCZukgWRgbszdvhtQTMVr24_1FWSZY89H1jTOOtiGUvb2qD9dw         MIDDLE      100     0  lo6YQidE22L5lMcvWrCrj5qFp7tdEa1gVHE9N6TB74HKPiwtqKagY9SofLhl5DiPMrx7v6CKxHAN7Q         BOTTOM      100     0  AJl3Ebo3uqJF7aOgPcvDPAasbhtNGufLGGGnCbedT_hvX_MI0ZWgwW1PADy8P1b9MJuLS6FnGrMkMQ        UTILITY      100     0  v7MdVgS5I_zXT04dcpOkqPk1AqL_G1-88gXH9Lg8u7-wIe6v6pV_YZ0awUWWs7VamaX25Y9Onb0xKw            TOP      200     1  huEycrZJfGKtB_sgan83kapEoISNKxW9BOAmanMFmuMkrNX5W7g9KZELdvllV2QW5oFjxF2cxlmeVw         JUNGLE      200     1  NO-0JdAO96OJkcVn9xnxizcamw6uEfwNRodM6hxyd8pKhi5IoiD9ZBf_Dr6CXh2cvDjm2Lf2lIlOpg         MIDDLE      200     1  e590wZ75DqKgcBj3wJeDY_01IBkek6ggWVdy96N8XbURMggPu3_CwLQcbvCRp4Ps_BnGUu4e3NBIOQ         BOTTOM      200     1  0sgU6iVda79lXfS-cdbZlT73EG5pvM6LUPZ9X5NClrRyhYqnjGdLlKkudNihZMhVOhr7PQ7uvBT13w         UTILITY       200      1
2  OC1_564915288  yWgGWIoPSMAQBQUmy4Y1kOJvZJP21vpmm9e4WG735KWTWqVddo7evtnWcsp3On4aXTzVSRhQW1vPEA         JUNGLE      100     1  C_FZOCg6LqBghfsji1PfM4Bruvrz_tKG-8RNVqsERrggeryB8rC8St5w3K3DkwFpSsiGVk32ru4nAg        UTILITY      100     1  ABmjzLoNuG0qPBBBitzk2UH_pPYZ05qGSaV-A9IR4fe_U4t2fRBx8PNzYCwlvfMHs2a0o6b6Cb8bmg            TOP      100     1  K8j8tLFucuT55DnCwx6kgnnBZcE7uYNj_nfNHiB2Axyx6XTOquOWgxZ-FBcOgvo1yQShjiyX0wm8mQ         MIDDLE      100     1  8s0y4xJWaILip4jvfh2FJv09G8chFK_S7bVNEY2rBt0LNRl5d-hAS0yCX2jeK6W-15w91r59epQxFA         BOTTOM      100     1  wpLwEIMeFjr7EnLQTebvOwcp9SUPa4LTFFcF8G3GrnZ5PiPix386fUswhAGNSSUCfLyK9G2rYR_eaQ         JUNGLE      200     0  R1jIzEuV7NrUbtF2hi6DUt4F0V9Zf2-aMMzleN45DjpSYh_zosdiakV1xGO3tbARU34lY67SFCO3jw         BOTTOM      200     0  2OAqPQHu6hIQh7wQK1BCrqptm-7rJptL0d3dBUIDX5xIVN08t7u8nq6tYwI-8C4VKGtC4ytVFYH-zw        UTILITY      200     0  GXb3f35CnWvE7ripK-fmLq9rgaAAc8-ScKDOgTYKLzJ1TKy-TXXm51d307Hx03mhjMYCl0rEyEC7jg         MIDDLE      200     0  YQk0vHWHy21aTrgtIoGpSobtK5mbdqmSaVmyBnK3DBfdIMNsna_QNCVpwUwFJii-UXxqrfkQ-x9fOA             TOP       200      0
3  OC1_564917000  YQk0vHWHy21aTrgtIoGpSobtK5mbdqmSaVmyBnK3DBfdIMNsna_QNCVpwUwFJii-UXxqrfkQ-x9fOA            TOP      100     1  R1jIzEuV7NrUbtF2hi6DUt4F0V9Zf2-aMMzleN45DjpSYh_zosdiakV1xGO3tbARU34lY67SFCO3jw         JUNGLE      100     1  GXb3f35CnWvE7ripK-fmLq9rgaAAc8-ScKDOgTYKLzJ1TKy-TXXm51d307Hx03mhjMYCl0rEyEC7jg         MIDDLE      100     1  wpLwEIMeFjr7EnLQTebvOwcp9SUPa4LTFFcF8G3GrnZ5PiPix386fUswhAGNSSUCfLyK9G2rYR_eaQ         BOTTOM      100     1  2OAqPQHu6hIQh7wQK1BCrqptm-7rJptL0d3dBUIDX5xIVN08t7u8nq6tYwI-8C4VKGtC4ytVFYH-zw        UTILITY      100     1  3QC7ggjT3H5kTiV9RbU3lzYZ7yi3fqQYf38byNdOhKWje1yu1G07SizvW5HmgtJe_fhevXOonPZ21g            TOP      200     0  rQE66nEYxzGNJBukM3YlIOH3hsH1avRp1cnfDCmXoJtf5XyIbVN90yWFpvkqO5VLbWODy5-vXCbxxw         JUNGLE      200     0  fwEH2oH4lAiIywJ9Y3HGfqmzT0FhTNx4t8-mLzQmij7tJjLQ9dffLx1uApHNs0bWJdhDTdUu03OLQA         MIDDLE      200     0  654_FhvWnIgiD23Czh1KNOIK8GIVA6fjmdCV97C0GSjsiIVLsFFZCeKeoVgB7HGYMb21lhLijoN5xA         BOTTOM      200     0  16v4L_UW-xBypBUlQcp3lft8fgBIJxkj0I6H-EZn1RvOEQDEPJCkTQ-IgXzv-EXUUPJ7sg3Ko-V2RQ         UTILITY       200      0
4  OC1_564920972  4lxWvLUfy2-3mLazrziCcgV5j2rRiEURO0NWhYjdSUz8AFwVvc38Gt7QJJRCrz4J-drdNwGp2ZbfrA            TOP      100     0  3tmKJP_XGhmqeeUXByNSy8bBHpFOBE5CXDTKvgkB6FhRuxD8xi3votFwEzRsOXfFEuiUYrBL7xHKZw         JUNGLE      100     0  0OUTx_sCUhrEa5Vj4aAFJQ8a6HDZdZ--2OqJf7SYue1PnuXTddPAdDdLXs-aog-iUOTooC1HrXWkhw         MIDDLE      100     0  QMS5hVUYeKhABtUHbBNj4rlUVAXb5XkK6TcKSnZKSyQpTPCBbIxyGS5B6o6UMK4VcX1IiD5F4RRm-Q         BOTTOM      100     0  3lxuDBNhcx3x5YxMEQ8N73U2sW9YbOhYjyf5mtm638oIP7UDu6rN-V-L3Jc7UWgD5RXZGLEP81zEFg        UTILITY      100     0  BsxYlhyKMQa9NrWjdit4lLYWNrpYZyR74roM7mJot4dGogbIexXIF1VrajhemWK2B6s20SN6n32V9Q            TOP      200     1  Lu3-cqU4ohhgWtE41pu_ja4NC9X7Q0GnSe-x2QrCNCrLlEB45T1HF1z3cb7FYZxDQfOS_6d-zaZhPg         JUNGLE      200     1  YS9WiI5EQR9XgFEFY4z3smomeio0VK79K7jip_LEZmiwT9j9Q3wx4gDbxzwc2nINiv6q4NCLyo3tBw         MIDDLE      200     1  ltScfc9WsvGZXNUcPbAS9gAtiftSLHUI7ZT-4PBE6YFgLO8Iv9YJSW-X4BYRPmE4dRsN7hBl5nvwig         BOTTOM      200     1  7k_0zhr0MkeSF8P3kdpVkqPvRy89vXMhoCief4nhvu1iBwybN1hCBh7PrQzVzgH9U2txQ2hB9bkZsQ         UTILITY       200      1

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340684 entries, 0 to 340683
Data columns (total 41 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   match_id         340684 non-null  object
 1   puuid_1          340684 non-null  object
 2   teamPosition_1   340684 non-null  object
 3   teamId_1         340684 non-null  object
 4   win_1            340684 non-null  object
 5   puuid_2          340684 non-null  object
 6   teamPosition_2   340684 non-null  object
 7   teamId_2         340684 non-null  object
 8   win_2            340684 non-null  object
 9   puuid_3          340684 non-null  object
 10  teamPosition_3   340684 non-null  object
 11  teamId_3         340684 non-null  object
 12  win_3            340684 non-null  object
 13  puuid_4          340684 non-null  object
 14  teamPosition_4   340684 non-null  object
 15  teamId_4         340684 non-null  object
 16  win_4            340684 non-null  object
 17  puuid_5          340266 non-null  object
 18  teamPosition_5   340266 non-null  object
 19  teamId_5         340266 non-null  object
 20  win_5            340266 non-null  object
 21  puuid_6          336011 non-null  object
 22  teamPosition_6   336011 non-null  object
 23  teamId_6         336011 non-null  object
 24  win_6            336011 non-null  object
 25  puuid_7          336011 non-null  object
 26  teamPosition_7   336011 non-null  object
 27  teamId_7         336011 non-null  object
 28  win_7            336011 non-null  object
 29  puuid_8          336011 non-null  object
 30  teamPosition_8   336011 non-null  object
 31  teamId_8         336011 non-null  object
 32  win_8            336011 non-null  object
 33  puuid_9          335955 non-null  object
 34  teamPosition_9   335955 non-null  object
 35  teamId_9         335955 non-null  object
 36  win_9            335955 non-null  object
 37  puuid_10         332520 non-null  object
 38  teamPosition_10  332520 non-null  object
 39  teamId_10        332520 non-null  object
 40  win_10           332520 non-null  object
dtypes: object(41)
memory usage: 106.6+ MB
None

Data for match OC1_564876644:
match_id                                                                            OC1_564876644
puuid_1            UoBtQQIjzVdYoQnCE3v7kauzvPBm_UFbuVYAxFEO7NrxLx5ERnaoigWwBaqvr0S7COnlXLch6-fhqw
teamPosition_1                                                                                TOP
teamId_1                                                                                      100
win_1                                                                                           0
puuid_2            FH8BKVK4gxBd8yd3w658M_k02mdHq_8seh_8CLEbbxFxCrIChhdbU46NzHiP0mYq55dPfI4U5q8Ivg
teamPosition_2                                                                             JUNGLE
teamId_2                                                                                      100
win_2                                                                                           0
puuid_3            n7Vx94O6wlXye4RI11L7SxbEXH-sPXGnQyKTtCr7uClPzNda5199rdKQ-cR34q2W-CAmHc8OZL2SFg
teamPosition_3                                                                             MIDDLE
teamId_3                                                                                      100
win_3                                                                                           0
puuid_4            cXPc4OrBrqibrLdDhgo4WZyR_gTpu7Bs65RmiY6yupMSAn07i1xnP7DaerYZWrwwdLLTr-UAfC0alw
teamPosition_4                                                                             BOTTOM
teamId_4                                                                                      100
win_4                                                                                           0
puuid_5            NPD9MTbIW0kgE2fFwkvvudSq0_FtvLnJbON0s6zc66luaThe8HCALsAxcb9-V7WWLT6A9bBPMnQbkg
teamPosition_5                                                                            UTILITY
teamId_5                                                                                      100
win_5                                                                                           0
puuid_6            CBWHup3kaETfy6ziW2Ty3d8jMbcUmhya-UtDpBCHilp8NFOGCQKpaCULCPHkfDiQmX-30z8a7BX1mQ
teamPosition_6                                                                                TOP
teamId_6                                                                                      200
win_6                                                                                           1
puuid_7            tPJAHi7KaWwVA3Jt4ubwuArADbx7WCZukgWRgbszdvhtQTMVr24_1FWSZY89H1jTOOtiGUvb2qD9dw
teamPosition_7                                                                             JUNGLE
teamId_7                                                                                      200
win_7                                                                                           1
puuid_8            sduZVJ-zJSh9qJKqV_0wnQN0EBhNnvhFg9u0AtGqXUYFjnpcbt8BvyBHyfxo-hAt9xgS-Uivq1GtrQ
teamPosition_8                                                                             MIDDLE
teamId_8                                                                                      200
win_8                                                                                           1
puuid_9            1xLRZHol5hz1JEI18bX_7nTpEDX03vqJN3VtnncTudnGbEHLKXLvD536iUFVpoH7bcsb5mBGVW4Bcw
teamPosition_9                                                                             BOTTOM
teamId_9                                                                                      200
win_9                                                                                           1
puuid_10           3dgn7-Fwe4mYKEFjyzovEh1nteHrt7B3EZUT1RP-mZ5vRYUEkjRDlWlZT6hoD2CaqZHUrCAhsXwEsw
teamPosition_10                                                                           UTILITY
teamId_10                                                                                     200
win_10                                                                                          1
Name: 0, dtype: object

Process completed successfully.

Creating a copy of the dataframe¶

In [11]:
# Create a copy of the DataFrame to keep the original separate
df_renamed = df_match_summary.copy()

# Function to rename columns
def rename_columns(col):
    if col.startswith('teamId_'):
        player_num = int(col.split('_')[1])
        if player_num <= 5:
            return 'team_1'
        else:
            return 'team_2'
    elif col.startswith('win_'):
        return 'win'
    else:
        return col

# Renaming the columns
df_renamed.columns = [rename_columns(col) for col in df_renamed.columns]

# Displaying the first few rows of the new DataFrame
df_renamed.head()
Out[11]:
match_id puuid_1 teamPosition_1 team_1 win puuid_2 teamPosition_2 team_1 win puuid_3 teamPosition_3 team_1 win puuid_4 teamPosition_4 team_1 win puuid_5 teamPosition_5 team_1 win puuid_6 teamPosition_6 team_2 win puuid_7 teamPosition_7 team_2 win puuid_8 teamPosition_8 team_2 win puuid_9 teamPosition_9 team_2 win puuid_10 teamPosition_10 team_2 win
0 OC1_564876644 UoBtQQIjzVdYoQnCE3v7kauzvPBm_UFbuVYAxFEO7NrxLx5ERnaoigWwBaqvr0S7COnlXLch6-fhqw TOP 100 0 FH8BKVK4gxBd8yd3w658M_k02mdHq_8seh_8CLEbbxFxCrIChhdbU46NzHiP0mYq55dPfI4U5q8Ivg JUNGLE 100 0 n7Vx94O6wlXye4RI11L7SxbEXH-sPXGnQyKTtCr7uClPzNda5199rdKQ-cR34q2W-CAmHc8OZL2SFg MIDDLE 100 0 cXPc4OrBrqibrLdDhgo4WZyR_gTpu7Bs65RmiY6yupMSAn07i1xnP7DaerYZWrwwdLLTr-UAfC0alw BOTTOM 100 0 NPD9MTbIW0kgE2fFwkvvudSq0_FtvLnJbON0s6zc66luaThe8HCALsAxcb9-V7WWLT6A9bBPMnQbkg UTILITY 100 0 CBWHup3kaETfy6ziW2Ty3d8jMbcUmhya-UtDpBCHilp8NFOGCQKpaCULCPHkfDiQmX-30z8a7BX1mQ TOP 200 1 tPJAHi7KaWwVA3Jt4ubwuArADbx7WCZukgWRgbszdvhtQTMVr24_1FWSZY89H1jTOOtiGUvb2qD9dw JUNGLE 200 1 sduZVJ-zJSh9qJKqV_0wnQN0EBhNnvhFg9u0AtGqXUYFjnpcbt8BvyBHyfxo-hAt9xgS-Uivq1GtrQ MIDDLE 200 1 1xLRZHol5hz1JEI18bX_7nTpEDX03vqJN3VtnncTudnGbEHLKXLvD536iUFVpoH7bcsb5mBGVW4Bcw BOTTOM 200 1 3dgn7-Fwe4mYKEFjyzovEh1nteHrt7B3EZUT1RP-mZ5vRYUEkjRDlWlZT6hoD2CaqZHUrCAhsXwEsw UTILITY 200 1
1 OC1_564882293 slIf7LP4Kr_1_CpGyOmL_BkzGSgGLNhAos6mRON5nOxvab8d1P7o6JPuqPeYQB5NBVeiNeGJzPCjyw TOP 100 0 jxud22N5_fkeJ0jmp9krUfNgcSkYXWe9nDtFk1hO4uTRndQe2KjVI4Ey4kO-_p2mvfKsckpQYQRi2A JUNGLE 100 0 tPJAHi7KaWwVA3Jt4ubwuArADbx7WCZukgWRgbszdvhtQTMVr24_1FWSZY89H1jTOOtiGUvb2qD9dw MIDDLE 100 0 lo6YQidE22L5lMcvWrCrj5qFp7tdEa1gVHE9N6TB74HKPiwtqKagY9SofLhl5DiPMrx7v6CKxHAN7Q BOTTOM 100 0 AJl3Ebo3uqJF7aOgPcvDPAasbhtNGufLGGGnCbedT_hvX_MI0ZWgwW1PADy8P1b9MJuLS6FnGrMkMQ UTILITY 100 0 v7MdVgS5I_zXT04dcpOkqPk1AqL_G1-88gXH9Lg8u7-wIe6v6pV_YZ0awUWWs7VamaX25Y9Onb0xKw TOP 200 1 huEycrZJfGKtB_sgan83kapEoISNKxW9BOAmanMFmuMkrNX5W7g9KZELdvllV2QW5oFjxF2cxlmeVw JUNGLE 200 1 NO-0JdAO96OJkcVn9xnxizcamw6uEfwNRodM6hxyd8pKhi5IoiD9ZBf_Dr6CXh2cvDjm2Lf2lIlOpg MIDDLE 200 1 e590wZ75DqKgcBj3wJeDY_01IBkek6ggWVdy96N8XbURMggPu3_CwLQcbvCRp4Ps_BnGUu4e3NBIOQ BOTTOM 200 1 0sgU6iVda79lXfS-cdbZlT73EG5pvM6LUPZ9X5NClrRyhYqnjGdLlKkudNihZMhVOhr7PQ7uvBT13w UTILITY 200 1
2 OC1_564915288 yWgGWIoPSMAQBQUmy4Y1kOJvZJP21vpmm9e4WG735KWTWqVddo7evtnWcsp3On4aXTzVSRhQW1vPEA JUNGLE 100 1 C_FZOCg6LqBghfsji1PfM4Bruvrz_tKG-8RNVqsERrggeryB8rC8St5w3K3DkwFpSsiGVk32ru4nAg UTILITY 100 1 ABmjzLoNuG0qPBBBitzk2UH_pPYZ05qGSaV-A9IR4fe_U4t2fRBx8PNzYCwlvfMHs2a0o6b6Cb8bmg TOP 100 1 K8j8tLFucuT55DnCwx6kgnnBZcE7uYNj_nfNHiB2Axyx6XTOquOWgxZ-FBcOgvo1yQShjiyX0wm8mQ MIDDLE 100 1 8s0y4xJWaILip4jvfh2FJv09G8chFK_S7bVNEY2rBt0LNRl5d-hAS0yCX2jeK6W-15w91r59epQxFA BOTTOM 100 1 wpLwEIMeFjr7EnLQTebvOwcp9SUPa4LTFFcF8G3GrnZ5PiPix386fUswhAGNSSUCfLyK9G2rYR_eaQ JUNGLE 200 0 R1jIzEuV7NrUbtF2hi6DUt4F0V9Zf2-aMMzleN45DjpSYh_zosdiakV1xGO3tbARU34lY67SFCO3jw BOTTOM 200 0 2OAqPQHu6hIQh7wQK1BCrqptm-7rJptL0d3dBUIDX5xIVN08t7u8nq6tYwI-8C4VKGtC4ytVFYH-zw UTILITY 200 0 GXb3f35CnWvE7ripK-fmLq9rgaAAc8-ScKDOgTYKLzJ1TKy-TXXm51d307Hx03mhjMYCl0rEyEC7jg MIDDLE 200 0 YQk0vHWHy21aTrgtIoGpSobtK5mbdqmSaVmyBnK3DBfdIMNsna_QNCVpwUwFJii-UXxqrfkQ-x9fOA TOP 200 0
3 OC1_564917000 YQk0vHWHy21aTrgtIoGpSobtK5mbdqmSaVmyBnK3DBfdIMNsna_QNCVpwUwFJii-UXxqrfkQ-x9fOA TOP 100 1 R1jIzEuV7NrUbtF2hi6DUt4F0V9Zf2-aMMzleN45DjpSYh_zosdiakV1xGO3tbARU34lY67SFCO3jw JUNGLE 100 1 GXb3f35CnWvE7ripK-fmLq9rgaAAc8-ScKDOgTYKLzJ1TKy-TXXm51d307Hx03mhjMYCl0rEyEC7jg MIDDLE 100 1 wpLwEIMeFjr7EnLQTebvOwcp9SUPa4LTFFcF8G3GrnZ5PiPix386fUswhAGNSSUCfLyK9G2rYR_eaQ BOTTOM 100 1 2OAqPQHu6hIQh7wQK1BCrqptm-7rJptL0d3dBUIDX5xIVN08t7u8nq6tYwI-8C4VKGtC4ytVFYH-zw UTILITY 100 1 3QC7ggjT3H5kTiV9RbU3lzYZ7yi3fqQYf38byNdOhKWje1yu1G07SizvW5HmgtJe_fhevXOonPZ21g TOP 200 0 rQE66nEYxzGNJBukM3YlIOH3hsH1avRp1cnfDCmXoJtf5XyIbVN90yWFpvkqO5VLbWODy5-vXCbxxw JUNGLE 200 0 fwEH2oH4lAiIywJ9Y3HGfqmzT0FhTNx4t8-mLzQmij7tJjLQ9dffLx1uApHNs0bWJdhDTdUu03OLQA MIDDLE 200 0 654_FhvWnIgiD23Czh1KNOIK8GIVA6fjmdCV97C0GSjsiIVLsFFZCeKeoVgB7HGYMb21lhLijoN5xA BOTTOM 200 0 16v4L_UW-xBypBUlQcp3lft8fgBIJxkj0I6H-EZn1RvOEQDEPJCkTQ-IgXzv-EXUUPJ7sg3Ko-V2RQ UTILITY 200 0
4 OC1_564920972 4lxWvLUfy2-3mLazrziCcgV5j2rRiEURO0NWhYjdSUz8AFwVvc38Gt7QJJRCrz4J-drdNwGp2ZbfrA TOP 100 0 3tmKJP_XGhmqeeUXByNSy8bBHpFOBE5CXDTKvgkB6FhRuxD8xi3votFwEzRsOXfFEuiUYrBL7xHKZw JUNGLE 100 0 0OUTx_sCUhrEa5Vj4aAFJQ8a6HDZdZ--2OqJf7SYue1PnuXTddPAdDdLXs-aog-iUOTooC1HrXWkhw MIDDLE 100 0 QMS5hVUYeKhABtUHbBNj4rlUVAXb5XkK6TcKSnZKSyQpTPCBbIxyGS5B6o6UMK4VcX1IiD5F4RRm-Q BOTTOM 100 0 3lxuDBNhcx3x5YxMEQ8N73U2sW9YbOhYjyf5mtm638oIP7UDu6rN-V-L3Jc7UWgD5RXZGLEP81zEFg UTILITY 100 0 BsxYlhyKMQa9NrWjdit4lLYWNrpYZyR74roM7mJot4dGogbIexXIF1VrajhemWK2B6s20SN6n32V9Q TOP 200 1 Lu3-cqU4ohhgWtE41pu_ja4NC9X7Q0GnSe-x2QrCNCrLlEB45T1HF1z3cb7FYZxDQfOS_6d-zaZhPg JUNGLE 200 1 YS9WiI5EQR9XgFEFY4z3smomeio0VK79K7jip_LEZmiwT9j9Q3wx4gDbxzwc2nINiv6q4NCLyo3tBw MIDDLE 200 1 ltScfc9WsvGZXNUcPbAS9gAtiftSLHUI7ZT-4PBE6YFgLO8Iv9YJSW-X4BYRPmE4dRsN7hBl5nvwig BOTTOM 200 1 7k_0zhr0MkeSF8P3kdpVkqPvRy89vXMhoCief4nhvu1iBwybN1hCBh7PrQzVzgH9U2txQ2hB9bkZsQ UTILITY 200 1

Creating a new dataframe with a different name¶

In [12]:
# Create a new DataFrame with a different name
df_lol_matches = df_match_summary.copy()

# Function to rename columns
def rename_columns(col):
    if col.startswith('teamId_'):
        player_num = int(col.split('_')[1])
        if player_num <= 5:
            return 'team_id'
        else:
            return 'team_id'
    elif col.startswith('win_'):
        player_num = int(col.split('_')[1])
        if player_num <= 5:
            return 'team_1_win'
        else:
            return 'team_2_win'
    elif col.startswith('teamPosition_'):
        player_num = col.split('_')[1]
        return f'teamPosition_puuid_{player_num}'
    else:
        return col

# Rename the columns
df_lol_matches.columns = [rename_columns(col) for col in df_lol_matches.columns]

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)

# Display the first few rows of the new DataFrame
df_lol_matches.head()
Out[12]:
match_id puuid_1 teamPosition_puuid_1 team_id team_1_win puuid_2 teamPosition_puuid_2 team_id team_1_win puuid_3 teamPosition_puuid_3 team_id team_1_win puuid_4 teamPosition_puuid_4 team_id team_1_win puuid_5 teamPosition_puuid_5 team_id team_1_win puuid_6 teamPosition_puuid_6 team_id team_2_win puuid_7 teamPosition_puuid_7 team_id team_2_win puuid_8 teamPosition_puuid_8 team_id team_2_win puuid_9 teamPosition_puuid_9 team_id team_2_win puuid_10 teamPosition_puuid_10 team_id team_2_win
0 OC1_564876644 UoBtQQIjzVdYoQnCE3v7kauzvPBm_UFbuVYAxFEO7NrxLx5ERnaoigWwBaqvr0S7COnlXLch6-fhqw TOP 100 0 FH8BKVK4gxBd8yd3w658M_k02mdHq_8seh_8CLEbbxFxCrIChhdbU46NzHiP0mYq55dPfI4U5q8Ivg JUNGLE 100 0 n7Vx94O6wlXye4RI11L7SxbEXH-sPXGnQyKTtCr7uClPzNda5199rdKQ-cR34q2W-CAmHc8OZL2SFg MIDDLE 100 0 cXPc4OrBrqibrLdDhgo4WZyR_gTpu7Bs65RmiY6yupMSAn07i1xnP7DaerYZWrwwdLLTr-UAfC0alw BOTTOM 100 0 NPD9MTbIW0kgE2fFwkvvudSq0_FtvLnJbON0s6zc66luaThe8HCALsAxcb9-V7WWLT6A9bBPMnQbkg UTILITY 100 0 CBWHup3kaETfy6ziW2Ty3d8jMbcUmhya-UtDpBCHilp8NFOGCQKpaCULCPHkfDiQmX-30z8a7BX1mQ TOP 200 1 tPJAHi7KaWwVA3Jt4ubwuArADbx7WCZukgWRgbszdvhtQTMVr24_1FWSZY89H1jTOOtiGUvb2qD9dw JUNGLE 200 1 sduZVJ-zJSh9qJKqV_0wnQN0EBhNnvhFg9u0AtGqXUYFjnpcbt8BvyBHyfxo-hAt9xgS-Uivq1GtrQ MIDDLE 200 1 1xLRZHol5hz1JEI18bX_7nTpEDX03vqJN3VtnncTudnGbEHLKXLvD536iUFVpoH7bcsb5mBGVW4Bcw BOTTOM 200 1 3dgn7-Fwe4mYKEFjyzovEh1nteHrt7B3EZUT1RP-mZ5vRYUEkjRDlWlZT6hoD2CaqZHUrCAhsXwEsw UTILITY 200 1
1 OC1_564882293 slIf7LP4Kr_1_CpGyOmL_BkzGSgGLNhAos6mRON5nOxvab8d1P7o6JPuqPeYQB5NBVeiNeGJzPCjyw TOP 100 0 jxud22N5_fkeJ0jmp9krUfNgcSkYXWe9nDtFk1hO4uTRndQe2KjVI4Ey4kO-_p2mvfKsckpQYQRi2A JUNGLE 100 0 tPJAHi7KaWwVA3Jt4ubwuArADbx7WCZukgWRgbszdvhtQTMVr24_1FWSZY89H1jTOOtiGUvb2qD9dw MIDDLE 100 0 lo6YQidE22L5lMcvWrCrj5qFp7tdEa1gVHE9N6TB74HKPiwtqKagY9SofLhl5DiPMrx7v6CKxHAN7Q BOTTOM 100 0 AJl3Ebo3uqJF7aOgPcvDPAasbhtNGufLGGGnCbedT_hvX_MI0ZWgwW1PADy8P1b9MJuLS6FnGrMkMQ UTILITY 100 0 v7MdVgS5I_zXT04dcpOkqPk1AqL_G1-88gXH9Lg8u7-wIe6v6pV_YZ0awUWWs7VamaX25Y9Onb0xKw TOP 200 1 huEycrZJfGKtB_sgan83kapEoISNKxW9BOAmanMFmuMkrNX5W7g9KZELdvllV2QW5oFjxF2cxlmeVw JUNGLE 200 1 NO-0JdAO96OJkcVn9xnxizcamw6uEfwNRodM6hxyd8pKhi5IoiD9ZBf_Dr6CXh2cvDjm2Lf2lIlOpg MIDDLE 200 1 e590wZ75DqKgcBj3wJeDY_01IBkek6ggWVdy96N8XbURMggPu3_CwLQcbvCRp4Ps_BnGUu4e3NBIOQ BOTTOM 200 1 0sgU6iVda79lXfS-cdbZlT73EG5pvM6LUPZ9X5NClrRyhYqnjGdLlKkudNihZMhVOhr7PQ7uvBT13w UTILITY 200 1
2 OC1_564915288 yWgGWIoPSMAQBQUmy4Y1kOJvZJP21vpmm9e4WG735KWTWqVddo7evtnWcsp3On4aXTzVSRhQW1vPEA JUNGLE 100 1 C_FZOCg6LqBghfsji1PfM4Bruvrz_tKG-8RNVqsERrggeryB8rC8St5w3K3DkwFpSsiGVk32ru4nAg UTILITY 100 1 ABmjzLoNuG0qPBBBitzk2UH_pPYZ05qGSaV-A9IR4fe_U4t2fRBx8PNzYCwlvfMHs2a0o6b6Cb8bmg TOP 100 1 K8j8tLFucuT55DnCwx6kgnnBZcE7uYNj_nfNHiB2Axyx6XTOquOWgxZ-FBcOgvo1yQShjiyX0wm8mQ MIDDLE 100 1 8s0y4xJWaILip4jvfh2FJv09G8chFK_S7bVNEY2rBt0LNRl5d-hAS0yCX2jeK6W-15w91r59epQxFA BOTTOM 100 1 wpLwEIMeFjr7EnLQTebvOwcp9SUPa4LTFFcF8G3GrnZ5PiPix386fUswhAGNSSUCfLyK9G2rYR_eaQ JUNGLE 200 0 R1jIzEuV7NrUbtF2hi6DUt4F0V9Zf2-aMMzleN45DjpSYh_zosdiakV1xGO3tbARU34lY67SFCO3jw BOTTOM 200 0 2OAqPQHu6hIQh7wQK1BCrqptm-7rJptL0d3dBUIDX5xIVN08t7u8nq6tYwI-8C4VKGtC4ytVFYH-zw UTILITY 200 0 GXb3f35CnWvE7ripK-fmLq9rgaAAc8-ScKDOgTYKLzJ1TKy-TXXm51d307Hx03mhjMYCl0rEyEC7jg MIDDLE 200 0 YQk0vHWHy21aTrgtIoGpSobtK5mbdqmSaVmyBnK3DBfdIMNsna_QNCVpwUwFJii-UXxqrfkQ-x9fOA TOP 200 0
3 OC1_564917000 YQk0vHWHy21aTrgtIoGpSobtK5mbdqmSaVmyBnK3DBfdIMNsna_QNCVpwUwFJii-UXxqrfkQ-x9fOA TOP 100 1 R1jIzEuV7NrUbtF2hi6DUt4F0V9Zf2-aMMzleN45DjpSYh_zosdiakV1xGO3tbARU34lY67SFCO3jw JUNGLE 100 1 GXb3f35CnWvE7ripK-fmLq9rgaAAc8-ScKDOgTYKLzJ1TKy-TXXm51d307Hx03mhjMYCl0rEyEC7jg MIDDLE 100 1 wpLwEIMeFjr7EnLQTebvOwcp9SUPa4LTFFcF8G3GrnZ5PiPix386fUswhAGNSSUCfLyK9G2rYR_eaQ BOTTOM 100 1 2OAqPQHu6hIQh7wQK1BCrqptm-7rJptL0d3dBUIDX5xIVN08t7u8nq6tYwI-8C4VKGtC4ytVFYH-zw UTILITY 100 1 3QC7ggjT3H5kTiV9RbU3lzYZ7yi3fqQYf38byNdOhKWje1yu1G07SizvW5HmgtJe_fhevXOonPZ21g TOP 200 0 rQE66nEYxzGNJBukM3YlIOH3hsH1avRp1cnfDCmXoJtf5XyIbVN90yWFpvkqO5VLbWODy5-vXCbxxw JUNGLE 200 0 fwEH2oH4lAiIywJ9Y3HGfqmzT0FhTNx4t8-mLzQmij7tJjLQ9dffLx1uApHNs0bWJdhDTdUu03OLQA MIDDLE 200 0 654_FhvWnIgiD23Czh1KNOIK8GIVA6fjmdCV97C0GSjsiIVLsFFZCeKeoVgB7HGYMb21lhLijoN5xA BOTTOM 200 0 16v4L_UW-xBypBUlQcp3lft8fgBIJxkj0I6H-EZn1RvOEQDEPJCkTQ-IgXzv-EXUUPJ7sg3Ko-V2RQ UTILITY 200 0
4 OC1_564920972 4lxWvLUfy2-3mLazrziCcgV5j2rRiEURO0NWhYjdSUz8AFwVvc38Gt7QJJRCrz4J-drdNwGp2ZbfrA TOP 100 0 3tmKJP_XGhmqeeUXByNSy8bBHpFOBE5CXDTKvgkB6FhRuxD8xi3votFwEzRsOXfFEuiUYrBL7xHKZw JUNGLE 100 0 0OUTx_sCUhrEa5Vj4aAFJQ8a6HDZdZ--2OqJf7SYue1PnuXTddPAdDdLXs-aog-iUOTooC1HrXWkhw MIDDLE 100 0 QMS5hVUYeKhABtUHbBNj4rlUVAXb5XkK6TcKSnZKSyQpTPCBbIxyGS5B6o6UMK4VcX1IiD5F4RRm-Q BOTTOM 100 0 3lxuDBNhcx3x5YxMEQ8N73U2sW9YbOhYjyf5mtm638oIP7UDu6rN-V-L3Jc7UWgD5RXZGLEP81zEFg UTILITY 100 0 BsxYlhyKMQa9NrWjdit4lLYWNrpYZyR74roM7mJot4dGogbIexXIF1VrajhemWK2B6s20SN6n32V9Q TOP 200 1 Lu3-cqU4ohhgWtE41pu_ja4NC9X7Q0GnSe-x2QrCNCrLlEB45T1HF1z3cb7FYZxDQfOS_6d-zaZhPg JUNGLE 200 1 YS9WiI5EQR9XgFEFY4z3smomeio0VK79K7jip_LEZmiwT9j9Q3wx4gDbxzwc2nINiv6q4NCLyo3tBw MIDDLE 200 1 ltScfc9WsvGZXNUcPbAS9gAtiftSLHUI7ZT-4PBE6YFgLO8Iv9YJSW-X4BYRPmE4dRsN7hBl5nvwig BOTTOM 200 1 7k_0zhr0MkeSF8P3kdpVkqPvRy89vXMhoCief4nhvu1iBwybN1hCBh7PrQzVzgH9U2txQ2hB9bkZsQ UTILITY 200 1

Filtering for puuids that occur at least 50 times¶

In [13]:
# Step 1: Count occurrences of each PUUID
puuid_counts = pd.Series()
for i in range(1, 11):
    puuid_counts = puuid_counts.add(df_lol_matches[f'puuid_{i}'].value_counts(), fill_value=0)

print("Top 10 PUUIDs by occurrence:")
print(puuid_counts.sort_values(ascending=False).head(10))

print(f"\nTotal unique PUUIDs: {len(puuid_counts)}")
print(f"PUUIDs appearing at least 50 times: {(puuid_counts >= 50).sum()}")
print(f"PUUIDs appearing less than 50 times: {(puuid_counts < 50).sum()}")

# Step 2: Get the list of PUUIDs that appear at least 50 times
frequent_puuids = puuid_counts[puuid_counts >= 50].index

# Step 3: Filter the DataFrame to keep only matches with frequent PUUIDs
def keep_frequent_matches(row):
    return all(row[f'puuid_{i}'] in frequent_puuids for i in range(1, 11))

df_lol_matches_filtered_50 = df_lol_matches[df_lol_matches.apply(keep_frequent_matches, axis=1)]

print(f"\nOriginal number of matches: {len(df_lol_matches)}")
print(f"Number of matches after filtering: {len(df_lol_matches_filtered_50)}")

# Display the first few rows of the filtered DataFrame
print("\nFirst few rows of the filtered DataFrame:")
print(df_lol_matches_filtered_50.head())

# Get info about the filtered DataFrame
print("\nInfo about the filtered DataFrame:")
print(df_lol_matches_filtered_50.info())
Top 10 PUUIDs by occurrence:
itA_C6VEO6BHUQC5_sL8dz_xqISQ0lqMvV-9n3pIJznrAH3UW7eSvsJXZ6x8cU23ppisLzfsoVRWxQ    1199.0
RuynagMA-JIAM5dyIVun5XSLXjmjy4fGvzDIYLI6MH1xeZHeyAdhPTnsR21NdzMRoiHQym9sc5R1iA    1185.0
0TNFgc7RTM3fmD9eDaXrf2nT6vctGyqUx_mNAr8JOQMVJqteMO_sa9eqx5H5vLPrWxstXitn0_bdwg    1169.0
AMX85m1E63xFJH7_kLMKXezgTvrwkJ78Vr_PZyEHuRjKSSmLOnxPWaO8CoAX2Ii9rL0amhq1mQv1lg    1143.0
bYboa730CW4FNtl0kLzq721Z-er_jjaRKcSGizqcSTmGwt2hjS9NUN_AuRPKJeoZzJTEZjrL2nfp8Q    1131.0
E7mT5cHjEUNzdYfPGdUQLNnBGX4i-MsLDo1GMsfdzH5CrXo4Mz6iKY_JUiuVkxH4AT6kTCFxqvgTEw    1120.0
UZWQvsTnG7F_xH6omzqlnMq1t_ZC8f6oE8mJRMcq0YnLX5vYYsKFyEALIM8SghhcMZh7kL4T455NAg    1110.0
le3guG2BewI75_YjKc-xNoiJpgedaY7OQTJo_uWgX3_LxsZn1rJ4g8pZpdmbttDkQmb5GEttuGHu5A    1110.0
ppiegP7S3BVBIBzXiXiOYlOPx-kNv_W_-cyDBJxaic7mqqHAt_x6AlU6YQMR01h86lEjVj8L3xcKUg    1093.0
TIOYshzp-zCafh2KanaR4X2OIWvHhxQrGuDBbDRu8jQPzMImldhgGf3DvxV5vdv53tCr1NVBebi60A    1057.0
dtype: object

Total unique PUUIDs: 284566
PUUIDs appearing at least 50 times: 8525
PUUIDs appearing less than 50 times: 276041

Original number of matches: 340684
Number of matches after filtering: 1402

First few rows of the filtered DataFrame:
            match_id                                                                         puuid_1 teamPosition_puuid_1 team_id team_1_win                                                                         puuid_2 teamPosition_puuid_2 team_id team_1_win                                                                         puuid_3 teamPosition_puuid_3 team_id team_1_win                                                                         puuid_4 teamPosition_puuid_4 team_id team_1_win                                                                         puuid_5 teamPosition_puuid_5 team_id team_1_win                                                                         puuid_6 teamPosition_puuid_6 team_id team_2_win                                                                         puuid_7 teamPosition_puuid_7 team_id team_2_win                                                                         puuid_8 teamPosition_puuid_8 team_id team_2_win                                                                         puuid_9 teamPosition_puuid_9 team_id team_2_win                                                                        puuid_10 teamPosition_puuid_10 team_id team_2_win
4416   OC1_575435102  G_Qsm6iSUtf6wuDaz_vyp1ahH49XlT7yW43fQ8qV_bsbhsxXlUdgy5uHgGh9qjTDO3V6zYppNXhx5Q               MIDDLE     100          1  57pc2Bz4bwlony11nKZYPyonvk0_O-50yk0F7ZNGgQ4KbFi1axkvR_8DvksL-JfzIjHn25E5CkdXCA                  TOP     100          1  zNfpJyfT-FrpkNaQhZ0kRAarAJKrH-4Jys81CCKmKSnL8u1YjZIxPnk9tFbH5V8BzNpEcq91O91Dfg               JUNGLE     100          1  i1MEaxolJkMpnV0_aXsvlhw_CrpmwpG5dORgayxvjgXB4Y13noz67crMQ59bzdAh7d6kZrwxq8GQwQ              UTILITY     100          1  ChQSSUwAEsD3dYbQsb9yiUc-qE6cDbBiD07jH_YFDneSYUnIe2f_6zHf6M0iTj6pxGcov3oAB5Aaig               BOTTOM     100          1  LOYYU8FQ-Am-QRFNAl5qjl8iYIuQMbq2SAfTDnqRvfosFg25OwaCiahehhzV0y5nISJQhc7WHgO8rw                  TOP     200          0  aiqAyTBGvlK-cygauqx9sd6m0HpOxRQcI3MNjYiwYOaQfd5QXcC3Zo4qPksDLK30KxvJGfTT97HXbw               MIDDLE     200          0  98JQ-4qgQefhTN7Ew2DxypPtdRS32YYb5SdWbP8k5py6dOqV_ub6K8wxWZYdm3zLSUWznegg3i2RfA               JUNGLE     200          0  tpNFG-FGuTZwJ5FxQNVFUWMVieD3zadlK73PSsqiLgZzj83G0p7TBWyAayMJsrxtW9eB1A-ltpNGnQ              UTILITY     200          0  PG9wrI_xu8QPN0Ch8EQ0e4RNYY70fjGShZEgmau8U1GpN4xlQWF-1HLGxRi0xRhy1SgO1k9z9-k9vw                BOTTOM     200          0
6495   OC1_576863881  LiySsbLxi8giQ_Od8vO1gvtlwKBwfOUNUqkk84l304oOs4clFDk6sIzV95tcBhhrOblqKKxweav1rw              UTILITY     100          0  g2pkr5zEnUNvEtZ9WMJDBUvCDIJjs5nkse1aTzyqJWSclRkFLbIqB0UvEIDsJ1WCwWvp77_bY4W8Tw               MIDDLE     100          0  RD9l0DyPGp0yBWr_EL5oXydOESoSzMqnF6dBa2DyPqLlP-S4fMAu_Um9cfFKGJ5jQI_0uG7rx7NMLQ               JUNGLE     100          0  NVzzTTP-v4T-o8QS-EEx78ZspadcFO4UKWIsYg42Ku54dDoaBNLrj46WRCIYYuYynNTJzoUp9LObYA                  TOP     100          0  ob1hxzhS7gc9B3qRFAvMNhhOSEIVUQpYlW5fepmpNsAgRgcj-n32rqPXOTpbdY1XqUKxO0Jp3vso0g               BOTTOM     100          0  tVpSOY04XX8DXktye7ORBUY-cU8ug6Rog1gEOg4JU1hXOKHHUmNfi-WZSVcuXyH3OQe_fcwT3iJKRA               BOTTOM     200          1  NgR6Mnn4ifl7yQnYauoaHqa8V7QrYTNRfu8TjACKATm0ZwR5qTZ4YeKcM5nWDbEwXAzzDvVtaxUFZw              UTILITY     200          1  WQ7Z47v8dDLscPJTLuXSHPkr0rttUxcPk8Hk18q-DsGDl3Wo4v-FufsD5ybe7x_d7kRBFId0TvmPGw               JUNGLE     200          1  7mQDLeiWuooC8LPUP2t92JEF6X7G06hHSeDFD1XexlgkOC9GIJe-NRuIW7tSUKM2X6WQW5xpsMYJYA               MIDDLE     200          1  dYzXqzdLEs03rBN2_pen16-sSAwggnbVH0JghtI1hZffqm8IYnkLOk0iwanEn8yw0kZrPPbKvXPx-g                   TOP     200          1
10952  OC1_580289443  L7kZWEXA07msnTyYpWLogCswn2_qiKvSKwGi_DIqjOcCjSRg_bXqSOtp90moDFrskyvE27QZweSh6w               JUNGLE     100          1  qH7Zv0Khz4LzrhZIgJtDoaXe7oSFrYboOa9_1eGSR_FvPFD_4trlRPSuwuIJ2tP73KDhJEjP2kRHhQ              UTILITY     100          1  v-kG6DnbQEz-4UY3R4E0s3NJIUgkIfPDdUyYzWpnA4_xnbQ5Tt_DI3oLg5Fp_-Jnhv52oe5SkUQ5kw               MIDDLE     100          1  nTbtwNeUxb5woPtWUFWZ-N30zGIK2k9RyBBWv9CTAE77LZOQoIU9R3uDYGErCiLMJWOnDLcCg-QtRA               BOTTOM     100          1  yvqfr0_lUYYoOeGyh8mpPiKp5mx9qbTF7QJ4JC0kTQvmRFF999rcqeYt5yghPgsLnMR2vwfLwxVl4g                  TOP     100          1  9OZMGzBsrNEIp6HxLbu0MIjegRYrwn3OYiTnnuL7ZBx42knc-OWWdgbJ2Mt7DJzk1ePOYbNuBnHAEg                  TOP     200          0  S8PsXst4UwnZCcIIcos0SRuIFlDIkd4nOhDDgNyjM0nhNAS4nHxN4ERfNeE41TrMb5Ykiir2Y8XWnw               JUNGLE     200          0  tIXtjQeVPQyO9MjYjBjFl55vroxxmVnBLpvJdGKITaujyTFV7tdM255zQUPPQYk5x674HjAaxLUT9g               MIDDLE     200          0  tLw53qiFlMaS1RR2bc8jB9-AXyM-bn1oClhAuk11JOon1vzpIUj7h16hU_e55TJyBGD3QL0I14KaWw               BOTTOM     200          0  535CDVoDwfrnadiTA8Ec0n4HEtBHc0Dbpz5KycxSVawfL08RdhDbJxqq-VjZSKr9Lk5iC2zKiMQ8cg               UTILITY     200          0
13682  OC1_581785061  uGalw3b1XlGhHLPzYDKTNpxW50nIzuN1dFM7jGeSj85XIGfP435blcXjgRvzxaEnc-kvTTqjW0SSQw              UTILITY     100          1  JbJrao9f5JToBD1zxY-51KbCpQquc_-vWAqdLwtdp48_ZB58meNZArw3Ze636sv_6ZJU_BjVvLMSBA               JUNGLE     100          1  dWYZwgh-RTFbRrNZFYjI5hAvvZZLHqDzht6LL8E-ONEzkFCOTx6zVa6UoMkSIWrS9V4bAcLEQErYPA               MIDDLE     100          1  8Eak5v4mk0JIqdlT2yy_jJf4M8pAfFO4HnWp8kUEybP-FuEgw6EUAyBFuylJ5vBE-GmAE6fmz--WAg                  TOP     100          1  VNl0K2PGyl7sXv9OK7BFzw-lYFLbUok6QifjWfksnXVXWpG6UKZ6aYSmF3_rWNrswVnm8boaV_R5IA               BOTTOM     100          1  IFLdxJGmBUw9hfjKfD9L5v-Dmh721D7SqCcLdcAoJ0rUFv1OcrkUJs-6UIs2-i11qt35CjGuhdMzkg               MIDDLE     200          0  3tgzrT1_ap-XJpMO2sa93LXntNDEpCfG3UQ0mjIBcOKp0Grwo0si7Mpc5nD7PDQrTwnOQgPQe_fsHw               JUNGLE     200          0  7Ot4veU3NEucWlhZtyHQ2VYAg1Z3Ftd2qXwW8GWN5JkkcGFm9qD0oHukiQcrM8Jcr13_g13XEerq5Q                  TOP     200          0  vXIsOKEEbkYubC-qT2Ssf6BGQxdrUmndETG1fk0JYzMkLnBHCY2-zWkVcs7TLMurbsmpH8X9s-WqXQ               BOTTOM     200          0  7PDZHE_Zy0RbJc1xdHX5JmCnyXB-YVgCtgeal0QFZXKZuqosGfDaA4eUv_LLV13cPFGcYMfo3Ja74g               UTILITY     200          0
14299  OC1_582131337  QjTmVJh62WyzkA4BGjdkudvM-7TPeVkgm7arBEkH2QRwSytXhWzVBmOUT0GRhjHHH-83bebX3HanxA                  TOP     100          0  IB2YRn0w9zD5ZagHB-GKwLcmF1Qe48KMxzLo7puKanPnkftvHnKmbEoEmkGSByPqn5C24HTNv9G6ag               JUNGLE     100          0  H8Z-iyHGUS53GaMyAM-cXPfv3_ETPfe3v83HLk9SGsDxZ0znhBZfcioun-YAoWXzmuiGODPbd2bugg               MIDDLE     100          0  BWbNfZ_ayvqXDLdSeUNuHBWnRVIldkWJlNpJh_Ykfs413Ec0Mp_y2qOPw8vdbfXFtzGCZ41NDMQhQA               BOTTOM     100          0  zwrStDRjTTM_vfTBavvuCia-4itxnppD8_RfzxQehVF_nZml44K_7BiLwh6qcD7c6Rzwip2j9rEzpQ              UTILITY     100          0  xBR7XNsWm-5rwuOTh87JjjWIUinreA6e8tC50gFlg54Onjgo7iz2M91fVH25DZqKnJKhMApqwVW7yA                  TOP     200          1  GGroACs5NScigG9TIJ9BugbrOUEsRoGV__v-OPZQH2Vandzp6PRAzZAtTsXvjRbEUx8Nw9wPMkM4Jg               JUNGLE     200          1  1M3scEg3VZY2zxxsA-6EfQzToaBZukoF9WzdyooYPVk-rrhoAHn9R2IqV2qYq9QTooQ5w8wP9SGsZA               MIDDLE     200          1  yH0gcf0dFcEYUBOCYbYVLBqV7Qvl0oOpwAv-PQW59S69ASRweMNgNW2E0IiisG4hc1pxN645RBRY3Q               BOTTOM     200          1  f6IvfyAJXKMr0TPgJ0NzUKUqB-oRAqQy302didbxIJGbVEkbq1-bCGv9ki4vCdmPhsb1welcVL1JkQ               UTILITY     200          1

Info about the filtered DataFrame:
<class 'pandas.core.frame.DataFrame'>
Index: 1402 entries, 4416 to 339618
Data columns (total 41 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   match_id               1402 non-null   object
 1   puuid_1                1402 non-null   object
 2   teamPosition_puuid_1   1402 non-null   object
 3   team_id                1402 non-null   object
 4   team_1_win             1402 non-null   object
 5   puuid_2                1402 non-null   object
 6   teamPosition_puuid_2   1402 non-null   object
 7   team_id                1402 non-null   object
 8   team_1_win             1402 non-null   object
 9   puuid_3                1402 non-null   object
 10  teamPosition_puuid_3   1402 non-null   object
 11  team_id                1402 non-null   object
 12  team_1_win             1402 non-null   object
 13  puuid_4                1402 non-null   object
 14  teamPosition_puuid_4   1402 non-null   object
 15  team_id                1402 non-null   object
 16  team_1_win             1402 non-null   object
 17  puuid_5                1402 non-null   object
 18  teamPosition_puuid_5   1402 non-null   object
 19  team_id                1402 non-null   object
 20  team_1_win             1402 non-null   object
 21  puuid_6                1402 non-null   object
 22  teamPosition_puuid_6   1402 non-null   object
 23  team_id                1402 non-null   object
 24  team_2_win             1402 non-null   object
 25  puuid_7                1402 non-null   object
 26  teamPosition_puuid_7   1402 non-null   object
 27  team_id                1402 non-null   object
 28  team_2_win             1402 non-null   object
 29  puuid_8                1402 non-null   object
 30  teamPosition_puuid_8   1402 non-null   object
 31  team_id                1402 non-null   object
 32  team_2_win             1402 non-null   object
 33  puuid_9                1402 non-null   object
 34  teamPosition_puuid_9   1402 non-null   object
 35  team_id                1402 non-null   object
 36  team_2_win             1402 non-null   object
 37  puuid_10               1402 non-null   object
 38  teamPosition_puuid_10  1402 non-null   object
 39  team_id                1402 non-null   object
 40  team_2_win             1402 non-null   object
dtypes: object(41)
memory usage: 460.0+ KB
None

Calculating individual statistics¶

In [14]:
import numpy as np

def calculate_individual_statistics(df):
    stats = []
    
    for i in range(1, 11):
        puuid_col = f'puuid_{i}'
        position_col = f'teamPosition_puuid_{i}'
        win_col = 'team_1_win' if i <= 5 else 'team_2_win'
        
        player_stats = df[[puuid_col, position_col]].copy()
        player_stats['win'] = df[win_col].iloc[:, 0]  # Take the first column of win data
        player_stats.columns = ['puuid', 'position', 'win']
        stats.append(player_stats)
    
    all_stats = pd.concat(stats, ignore_index=True)
    
    # Convert win column to numeric
    all_stats['win'] = all_stats['win'].astype(int)
    
    # Calculate statistics
    individual_stats = all_stats.groupby(['puuid', 'position']).agg({
        'win': ['count', 'mean']
    }).reset_index()
    
    individual_stats.columns = ['puuid', 'position', 'games_played', 'win_rate']
    
    # Pivot the table to have positions as columns
    pivot_stats = individual_stats.pivot(index='puuid', columns='position', values=['games_played', 'win_rate'])
    
    # Flatten column names
    pivot_stats.columns = [f'{col[1]}_{col[0]}' for col in pivot_stats.columns]
    
    # Reset index to make puuid a column
    pivot_stats = pivot_stats.reset_index()
    
    # Fill NaN values with 0 for games_played and win_rate
    for col in pivot_stats.columns:
        if col.endswith('_games_played') or col.endswith('_win_rate'):
            pivot_stats[col] = pivot_stats[col].fillna(0)
    
    # Calculate total games and overall win rate for each player
    total_stats = all_stats.groupby('puuid').agg({
        'win': ['count', 'mean']
    }).reset_index()
    total_stats.columns = ['puuid', 'total_games', 'overall_win_rate']
    
    # Merge pivot stats with total stats
    final_stats = pd.merge(pivot_stats, total_stats, on='puuid')
    
    # Reorder columns
    position_columns = [col for col in final_stats.columns if col != 'puuid' and col not in ['total_games', 'overall_win_rate']]
    column_order = ['puuid'] + position_columns + ['total_games', 'overall_win_rate']
    final_stats = final_stats[column_order]
    
    return final_stats

# Calculate individual statistics
individual_statistics_50 = calculate_individual_statistics(df_lol_matches_filtered_50)

# Sort the dataframe by total games played
individual_statistics_50 = individual_statistics_50.sort_values('total_games', ascending=False)

# Display the first few rows of the new dataframe
print("First few rows of individual statistics:")
print(individual_statistics_50.head(20))

# Get info about the new dataframe
print("\nDataFrame Info:")
print(individual_statistics_50.info())

# Display some summary statistics
print("\nTop 10 players by total games played:")
print(individual_statistics_50[['puuid', 'total_games']].head(10))

print("\nTop 10 players by overall win rate (minimum 50 games):")
print(individual_statistics_50[individual_statistics_50['total_games'] >= 50][['puuid', 'overall_win_rate']].sort_values('overall_win_rate', ascending=False).head(10))

# Calculate and print the number of players for each role
print("\nNumber of players in each role:")
for position in ['TOP', 'JUNGLE', 'MIDDLE', 'BOTTOM', 'UTILITY']:
    players_in_position = (individual_statistics_50[f'{position}_games_played'] > 0).sum()
    print(f"Number of players who played {position}: {players_in_position}")

# Additional statistics
print("\nTotal number of unique players:")
print(len(individual_statistics_50))

print("\nAverage number of games played per player:")
print(individual_statistics_50['total_games'].mean())

print("\nMedian number of games played per player:")
print(individual_statistics_50['total_games'].median())

print("\nOverall win rate distribution:")
print(individual_statistics_50['overall_win_rate'].describe())

# Optional: Save to CSV
#individual_statistics_50.to_csv('individual_statistics_50.csv', index=False)
#print("\nStatistics saved to 'individual_statistics_50.csv'")
First few rows of individual statistics:
                                                                               puuid  BOTTOM_games_played  JUNGLE_games_played  MIDDLE_games_played  TOP_games_played  UTILITY_games_played  BOTTOM_win_rate  JUNGLE_win_rate  MIDDLE_win_rate  TOP_win_rate  UTILITY_win_rate  total_games  overall_win_rate
905   PzaOrvw-CzAK4eaNn5d-so7swCCx5pwD9N-sJoVrrFyj3BLWa5abXPF5WxTjA6MkZaM65cwgaPrwtQ                  1.0                235.0                 18.0               0.0                   6.0         0.000000         0.536170         0.388889      0.000000          0.666667          260          0.526923
108   2614d1Fw7-UV1HE4ALpI4YvHnApKnn1X1GoLYfZ7rgPUNdO8nD9MqAv34uYwqo_Lkx3nrm0jK8W0mg                124.0                  1.0                  1.0               3.0                  15.0         0.483871         0.000000         0.000000      0.333333          0.400000          144          0.465278
597   H0aRGeflcbSveCqhCNyY0qSDWQXLCXZHY9chqX4fOwfXKLIOXvkTXpeP8VZt3xY2Jhi8_wajcUUjJA                  2.0                  0.0                 80.0               6.0                  38.0         0.000000         0.000000         0.425000      0.333333          0.473684          126          0.428571
1708  oPpi9B9Ispp37iaiMH-5fGW-mBW3YaGsm-dxyIpFURbgG2TOPWED9MzX53yB-sxB21cHGFSumUpyJg                 13.0                 43.0                 20.0              22.0                  19.0         0.384615         0.604651         0.350000      0.409091          0.578947          117          0.495726
616   Hc_TRsKtT6pX-YpfeJtm1ks7iZTCoXyBtS0wqHJXKFBq_8r06xSKp2bYfJI82w25yuiZqqaOvqq2BQ                  1.0                  0.0                106.0               4.0                   1.0         0.000000         0.000000         0.537736      0.500000          1.000000          112          0.535714
782   MOz6RS84aIV9H3WtLyLYhYh7UVfgzbHON6OA4Q2bLn3BxwdKs6zBrWi7hoEWH1rnEJzW1SagMBXujw                  7.0                 87.0                  8.0               0.0                   2.0         0.428571         0.482759         0.250000      0.000000          1.000000          104          0.471154
1484  ggGc6V0L2amsOkPIZxkmVugbhVcf0KsnXoD_99URd0gDxvO1cb-JgxDqwogOANdIHaqfgqXf2u28wA                  3.0                  3.0                  6.0              80.0                   3.0         0.666667         0.333333         0.666667      0.525000          0.000000           95          0.515789
1060  UZWQvsTnG7F_xH6omzqlnMq1t_ZC8f6oE8mJRMcq0YnLX5vYYsKFyEALIM8SghhcMZh7kL4T455NAg                 20.0                  1.0                 33.0               2.0                  37.0         0.300000         0.000000         0.515152      0.500000          0.486486           93          0.451613
1566  jcidv6vpoGmkGdgUYvty-2BRe-8y2eLMfIJwI3EZt73UbLtMQoMnmImvX6_ZxCFJv5IjYNxTTuJ-9Q                 34.0                 12.0                  3.0               0.0                  32.0         0.617647         0.500000         1.000000      0.000000          0.437500           81          0.543210
1150  XT2P981ZaIQs2dcieK6agVDyvvqkEXPWTSal33EwUec_HbpN_f53hT7AUy6OWVtMA9C-W0cIs1MxHw                  0.0                  0.0                  0.0               0.0                  78.0         0.000000         0.000000         0.000000      0.000000          0.525641           78          0.525641
917   QIuzkrgiDtkTM_VwTL1E-lUPFn3Jh5KhJASSJeva_TLGX8cSgrJ20w0Y94zvmCg5F4wD7xPNaNgthQ                  4.0                  4.0                 34.0              31.0                   1.0         0.500000         0.250000         0.558824      0.709677          1.000000           74          0.608108
581   GLj5bz5O8Z5x6qDRPTf2GWOqHprZE4W9M1fIbdTvhDE_4_NrUDuFbRsV2eM7KQ1_Ap97avJxARic7Q                  1.0                 12.0                  2.0              56.0                   1.0         0.000000         0.500000         0.000000      0.339286          1.000000           72          0.361111
1721  ogzygEBEpss6yjd4x7veDAqn6GoTkU4xN1emdv--GOnkId28nhGvAm7Oq6RAPTlCaTYRd8hx520rqg                  1.0                  4.0                  5.0              57.0                   4.0         1.000000         1.000000         0.800000      0.543860          0.500000           71          0.591549
816   NR_KG3_OQziQ7Ad9qUF0poSroauFtze64soBRSheH6xZVKiPIRA3yeoC1Cuv9oJX6WkWw_ZronVXWg                 46.0                  3.0                  8.0               3.0                   1.0         0.565217         0.666667         0.500000      0.000000          1.000000           61          0.540984
1927  v-F8VoovrhEUNy2QwZk6Vou0RHPEMyQCBtJRonV8N50-j9sfIxQVimiVOizCsyKEaRYMAf7Fn2p6tg                  0.0                  0.0                  3.0              57.0                   0.0         0.000000         0.000000         0.000000      0.596491          0.000000           60          0.566667
1063  UbIMDjlFAwcaZFAcwXOpbzV9Xx2pj8DKuJL2qJLZx0oFJ_iRltIGRk8EmIwxf5_ccJJBAb6BL1PcEg                 10.0                 49.0                  0.0               0.0                   0.0         0.700000         0.489796         0.000000      0.000000          0.000000           59          0.525424
1654  mf2Ytih03-qDRFTq6D2ZGz5g_N-LeOrdGdU5LIQ4rUiXHkzMUGi6ZG0l-P9t5_wSEgvbm9OB-YrpZg                  1.0                  1.0                  5.0              52.0                   0.0         1.000000         0.000000         0.600000      0.384615          0.000000           59          0.406780
1740  pDwhqYyxc9lgdf-Xj_nh-Y3PwrFh-QBXkOytErZ0HssjuoMU_dE52IJS3mdWeIN0x9qEpmagWfzAuw                 22.0                 12.0                  5.0              10.0                   9.0         0.409091         0.333333         0.400000      0.300000          0.333333           58          0.362069
2051  ycQuMhKs31pqW_e0C6GtrHSgtw10tdiMmhvKUkHwhztQx1Nm4G73c7DQpXDmY6w__xWGET5-7ongww                 45.0                  2.0                  0.0               0.0                  10.0         0.577778         0.000000         0.000000      0.000000          0.500000           57          0.543860
1569  jeSUwUB2SUxpfHYGDkUahTx8hVLmQz-6AHlrIR0y-nXdyvQD1QUQ3tLlVQJAzdM2cT0JMD1_s6uUcg                 10.0                 13.0                  6.0              20.0                   7.0         0.300000         0.538462         0.500000      0.500000          0.857143           56          0.517857

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 2092 entries, 905 to 2091
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   puuid                 2092 non-null   object 
 1   BOTTOM_games_played   2092 non-null   float64
 2   JUNGLE_games_played   2092 non-null   float64
 3   MIDDLE_games_played   2092 non-null   float64
 4   TOP_games_played      2092 non-null   float64
 5   UTILITY_games_played  2092 non-null   float64
 6   BOTTOM_win_rate       2092 non-null   float64
 7   JUNGLE_win_rate       2092 non-null   float64
 8   MIDDLE_win_rate       2092 non-null   float64
 9   TOP_win_rate          2092 non-null   float64
 10  UTILITY_win_rate      2092 non-null   float64
 11  total_games           2092 non-null   int64  
 12  overall_win_rate      2092 non-null   float64
dtypes: float64(11), int64(1), object(1)
memory usage: 228.8+ KB
None

Top 10 players by total games played:
                                                                               puuid  total_games
905   PzaOrvw-CzAK4eaNn5d-so7swCCx5pwD9N-sJoVrrFyj3BLWa5abXPF5WxTjA6MkZaM65cwgaPrwtQ          260
108   2614d1Fw7-UV1HE4ALpI4YvHnApKnn1X1GoLYfZ7rgPUNdO8nD9MqAv34uYwqo_Lkx3nrm0jK8W0mg          144
597   H0aRGeflcbSveCqhCNyY0qSDWQXLCXZHY9chqX4fOwfXKLIOXvkTXpeP8VZt3xY2Jhi8_wajcUUjJA          126
1708  oPpi9B9Ispp37iaiMH-5fGW-mBW3YaGsm-dxyIpFURbgG2TOPWED9MzX53yB-sxB21cHGFSumUpyJg          117
616   Hc_TRsKtT6pX-YpfeJtm1ks7iZTCoXyBtS0wqHJXKFBq_8r06xSKp2bYfJI82w25yuiZqqaOvqq2BQ          112
782   MOz6RS84aIV9H3WtLyLYhYh7UVfgzbHON6OA4Q2bLn3BxwdKs6zBrWi7hoEWH1rnEJzW1SagMBXujw          104
1484  ggGc6V0L2amsOkPIZxkmVugbhVcf0KsnXoD_99URd0gDxvO1cb-JgxDqwogOANdIHaqfgqXf2u28wA           95
1060  UZWQvsTnG7F_xH6omzqlnMq1t_ZC8f6oE8mJRMcq0YnLX5vYYsKFyEALIM8SghhcMZh7kL4T455NAg           93
1566  jcidv6vpoGmkGdgUYvty-2BRe-8y2eLMfIJwI3EZt73UbLtMQoMnmImvX6_ZxCFJv5IjYNxTTuJ-9Q           81
1150  XT2P981ZaIQs2dcieK6agVDyvvqkEXPWTSal33EwUec_HbpN_f53hT7AUy6OWVtMA9C-W0cIs1MxHw           78

Top 10 players by overall win rate (minimum 50 games):
                                                                               puuid  overall_win_rate
917   QIuzkrgiDtkTM_VwTL1E-lUPFn3Jh5KhJASSJeva_TLGX8cSgrJ20w0Y94zvmCg5F4wD7xPNaNgthQ          0.608108
195   4zmhox-1mypAYZ14DoB1iI7Uj4LZVIWGLDPA3xVsOLnY_pBvVK3A3Z2B0uQOOCBSY0OptNgWcbv1xA          0.600000
1721  ogzygEBEpss6yjd4x7veDAqn6GoTkU4xN1emdv--GOnkId28nhGvAm7Oq6RAPTlCaTYRd8hx520rqg          0.591549
1901  tttHBY64MQpftzlwZJFVURRgU1Ll4Elrz_c1HTNVWSXdfvQwHUviQkutfeUrf06aFN8SBeyTGm2KIA          0.584906
1927  v-F8VoovrhEUNy2QwZk6Vou0RHPEMyQCBtJRonV8N50-j9sfIxQVimiVOizCsyKEaRYMAf7Fn2p6tg          0.566667
2051  ycQuMhKs31pqW_e0C6GtrHSgtw10tdiMmhvKUkHwhztQx1Nm4G73c7DQpXDmY6w__xWGET5-7ongww          0.543860
1566  jcidv6vpoGmkGdgUYvty-2BRe-8y2eLMfIJwI3EZt73UbLtMQoMnmImvX6_ZxCFJv5IjYNxTTuJ-9Q          0.543210
816   NR_KG3_OQziQ7Ad9qUF0poSroauFtze64soBRSheH6xZVKiPIRA3yeoC1Cuv9oJX6WkWw_ZronVXWg          0.540984
616   Hc_TRsKtT6pX-YpfeJtm1ks7iZTCoXyBtS0wqHJXKFBq_8r06xSKp2bYfJI82w25yuiZqqaOvqq2BQ          0.535714
6     -D5VyOXAN1LGD4i5QIRIBproOnUlKKkoyDR8gZdkOUwNmfQ9lEHDqVRUlJSjDsx5PQYUrsr3JZH-cA          0.529412

Number of players in each role:
Number of players who played TOP: 721
Number of players who played JUNGLE: 685
Number of players who played MIDDLE: 769
Number of players who played BOTTOM: 727
Number of players who played UTILITY: 723

Total number of unique players:
2092

Average number of games played per player:
6.701720841300191

Median number of games played per player:
2.0

Overall win rate distribution:
count    2092.000000
mean        0.492652
std         0.375748
min         0.000000
25%         0.000000
50%         0.500000
75%         1.000000
max         1.000000
Name: overall_win_rate, dtype: float64
In [10]:
individual_statistics_50
Out[10]:
puuid BOTTOM_games_played JUNGLE_games_played MIDDLE_games_played TOP_games_played UTILITY_games_played BOTTOM_win_rate JUNGLE_win_rate MIDDLE_win_rate TOP_win_rate UTILITY_win_rate total_games overall_win_rate
905 PzaOrvw-CzAK4eaNn5d-so7swCCx5pwD9N-sJoVrrFyj3BLWa5abXPF5WxTjA6MkZaM65cwgaPrwtQ 1.0 235.0 18.0 0.0 6.0 0.000000 0.536170 0.388889 0.000000 0.666667 260 0.526923
108 2614d1Fw7-UV1HE4ALpI4YvHnApKnn1X1GoLYfZ7rgPUNdO8nD9MqAv34uYwqo_Lkx3nrm0jK8W0mg 124.0 1.0 1.0 3.0 15.0 0.483871 0.000000 0.000000 0.333333 0.400000 144 0.465278
597 H0aRGeflcbSveCqhCNyY0qSDWQXLCXZHY9chqX4fOwfXKLIOXvkTXpeP8VZt3xY2Jhi8_wajcUUjJA 2.0 0.0 80.0 6.0 38.0 0.000000 0.000000 0.425000 0.333333 0.473684 126 0.428571
1708 oPpi9B9Ispp37iaiMH-5fGW-mBW3YaGsm-dxyIpFURbgG2TOPWED9MzX53yB-sxB21cHGFSumUpyJg 13.0 43.0 20.0 22.0 19.0 0.384615 0.604651 0.350000 0.409091 0.578947 117 0.495726
616 Hc_TRsKtT6pX-YpfeJtm1ks7iZTCoXyBtS0wqHJXKFBq_8r06xSKp2bYfJI82w25yuiZqqaOvqq2BQ 1.0 0.0 106.0 4.0 1.0 0.000000 0.000000 0.537736 0.500000 1.000000 112 0.535714
... ... ... ... ... ... ... ... ... ... ... ... ... ...
1008 SoFo6vs5EuxQ4AJ7YfgR2o7aXiBw9nPIOtD51DsbUfUt9YHD9cQDLQL6DT-0KFVB9CV5Jkotqy2bSw 0.0 0.0 1.0 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 1 0.000000
1007 SoAa5xeK65tEmRXmtwic6PAIOnEJUT94ALj0HpAq5aMd9fHIEcqtefnxv-AqSgfWwC40ixqjp1Xg6w 1.0 0.0 0.0 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 1 0.000000
1005 SlA0J3w7dXkeAHbgO5d-Df-W6-jobFeLMMx0ffV07CvBD3N6mnEahE8yZ9G_TzBj7PJcnVj63v1qWQ 0.0 0.0 0.0 0.0 1.0 0.000000 0.000000 0.000000 0.000000 0.000000 1 0.000000
1003 SgR4KjLgzwh77SFQh0QDry9C1amoeyQddfvwUBuYtwiJCNbiW_ZOKa1WJuxydvQq4xuHZtrBccd4iQ 0.0 0.0 0.0 0.0 1.0 0.000000 0.000000 0.000000 0.000000 1.000000 1 1.000000
2091 zwrStDRjTTM_vfTBavvuCia-4itxnppD8_RfzxQehVF_nZml44K_7BiLwh6qcD7c6Rzwip2j9rEzpQ 0.0 0.0 0.0 0.0 1.0 0.000000 0.000000 0.000000 0.000000 0.000000 1 0.000000

2092 rows × 13 columns

Calculating team statistics¶

In [15]:
def calculate_team_statistics(df_matches, df_individual):
    team_stats = []
    
    for _, match in df_matches.iterrows():
        for i in range(1, 11):
            puuid = match[f'puuid_{i}']
            team = 1 if i <= 5 else 2
            
            # Get teammates and opponents
            teammates = [match[f'puuid_{j}'] for j in range(1, 11) if (j <= 5) == (i <= 5) and j != i]
            opponents = [match[f'puuid_{j}'] for j in range(1, 11) if (j <= 5) != (i <= 5)]
            
            # Get stats for teammates and opponents
            teammate_stats = df_individual[df_individual['puuid'].isin(teammates)]
            opponent_stats = df_individual[df_individual['puuid'].isin(opponents)]
            
            # Calculate average stats
            avg_teammate_stats = teammate_stats[['TOP_games_played', 'JUNGLE_games_played', 'MIDDLE_games_played', 'BOTTOM_games_played', 'UTILITY_games_played',
                                                 'TOP_win_rate', 'JUNGLE_win_rate', 'MIDDLE_win_rate', 'BOTTOM_win_rate', 'UTILITY_win_rate',
                                                 'total_games', 'overall_win_rate']].mean()
            
            avg_opponent_stats = opponent_stats[['TOP_games_played', 'JUNGLE_games_played', 'MIDDLE_games_played', 'BOTTOM_games_played', 'UTILITY_games_played',
                                                 'TOP_win_rate', 'JUNGLE_win_rate', 'MIDDLE_win_rate', 'BOTTOM_win_rate', 'UTILITY_win_rate',
                                                 'total_games', 'overall_win_rate']].mean()
            
            # Rename columns
            avg_teammate_stats = avg_teammate_stats.add_prefix('teammate_avg_')
            avg_opponent_stats = avg_opponent_stats.add_prefix('opponent_avg_')
            
            # Combine stats
            combined_stats = pd.concat([pd.Series({'puuid': puuid}), avg_teammate_stats, avg_opponent_stats])
            team_stats.append(combined_stats)
    
    team_stats_df = pd.DataFrame(team_stats)
    
    # Group by puuid and calculate mean
    final_team_stats = team_stats_df.groupby('puuid').mean().reset_index()
    
    return final_team_stats

# Calculate team statistics
team_statistics_50 = calculate_team_statistics(df_lol_matches_filtered_50, individual_statistics_50)

# Merge with individual statistics
final_statistics_50 = pd.merge(individual_statistics_50, team_statistics_50, on='puuid')

# Display the first few rows of the new dataframe
print("First few rows of final statistics:")
print(final_statistics_50.head())

# Get info about the new dataframe
print("\nDataFrame Info:")
print(final_statistics_50.info())

# Save to CSV
final_statistics_50.to_csv('final_statistics_50.csv', index=False)
print("\nStatistics saved to 'final_statistics_50.csv'")

# Additional summary statistics
print("\nNumber of players in final statistics:")
print(len(final_statistics_50))

print("\nAverage number of games played per player:")
print(final_statistics_50['total_games'].mean())

print("\nMedian number of games played per player:")
print(final_statistics_50['total_games'].median())

print("\nDistribution of games played:")
print(final_statistics_50['total_games'].describe())

print("\nAverage teammate overall win rate:")
print(final_statistics_50['teammate_avg_overall_win_rate'].mean())

print("\nAverage opponent overall win rate:")
print(final_statistics_50['opponent_avg_overall_win_rate'].mean())

# Top players by total games
print("\nTop 10 players by total games played:")
print(final_statistics_50.nlargest(10, 'total_games')[['puuid', 'total_games']])

# Top players by win rate (minimum 100 games)
high_volume_players = final_statistics_50[final_statistics_50['total_games'] >= 100]
print("\nTop 10 high-volume players (100+ games) by win rate:")
print(high_volume_players.nlargest(10, 'overall_win_rate')[['puuid', 'total_games', 'overall_win_rate']])
First few rows of final statistics:
                                                                            puuid  BOTTOM_games_played  JUNGLE_games_played  MIDDLE_games_played  TOP_games_played  UTILITY_games_played  BOTTOM_win_rate  JUNGLE_win_rate  MIDDLE_win_rate  TOP_win_rate  UTILITY_win_rate  total_games  overall_win_rate  teammate_avg_TOP_games_played  teammate_avg_JUNGLE_games_played  teammate_avg_MIDDLE_games_played  teammate_avg_BOTTOM_games_played  teammate_avg_UTILITY_games_played  teammate_avg_TOP_win_rate  teammate_avg_JUNGLE_win_rate  teammate_avg_MIDDLE_win_rate  teammate_avg_BOTTOM_win_rate  teammate_avg_UTILITY_win_rate  teammate_avg_total_games  teammate_avg_overall_win_rate  opponent_avg_TOP_games_played  opponent_avg_JUNGLE_games_played  opponent_avg_MIDDLE_games_played  opponent_avg_BOTTOM_games_played  opponent_avg_UTILITY_games_played  opponent_avg_TOP_win_rate  opponent_avg_JUNGLE_win_rate  opponent_avg_MIDDLE_win_rate  opponent_avg_BOTTOM_win_rate  opponent_avg_UTILITY_win_rate  opponent_avg_total_games  opponent_avg_overall_win_rate
0  PzaOrvw-CzAK4eaNn5d-so7swCCx5pwD9N-sJoVrrFyj3BLWa5abXPF5WxTjA6MkZaM65cwgaPrwtQ                  1.0                235.0                 18.0               0.0                   6.0         0.000000         0.536170         0.388889      0.000000          0.666667          260          0.526923                       7.512500                          3.269231                          6.180769                          7.427885                           6.341346                   0.282205                      0.226809                      0.341659                      0.323300                       0.312858                 30.731731                       0.509404                       7.278462                          5.916154                          6.633846                          6.278462                           5.459231                   0.261894                      0.276024                      0.333557                      0.307605                       0.315497                 31.566154                       0.507218
1  2614d1Fw7-UV1HE4ALpI4YvHnApKnn1X1GoLYfZ7rgPUNdO8nD9MqAv34uYwqo_Lkx3nrm0jK8W0mg                124.0                  1.0                  1.0               3.0                  15.0         0.483871         0.000000         0.000000      0.333333          0.400000          144          0.465278                       8.512153                         10.083333                          6.989583                          3.263889                           5.722222                   0.291860                      0.271317                      0.365129                      0.249143                       0.312629                 34.571181                       0.510198                       6.834722                          5.909722                          6.270833                          5.923611                           5.712500                   0.256418                      0.268402                      0.353182                      0.302870                       0.321966                 30.651389                       0.504959
2  H0aRGeflcbSveCqhCNyY0qSDWQXLCXZHY9chqX4fOwfXKLIOXvkTXpeP8VZt3xY2Jhi8_wajcUUjJA                  2.0                  0.0                 80.0               6.0                  38.0         0.000000         0.000000         0.425000      0.333333          0.473684          126          0.428571                       6.617063                          9.434524                          4.412698                          6.210317                           5.253968                   0.274838                      0.290610                      0.308358                      0.292511                       0.286963                 31.928571                       0.504284                       7.136508                          8.255556                          7.366667                          5.938095                           5.842857                   0.277010                      0.285576                      0.354699                      0.270991                       0.330912                 34.539683                       0.519247
3  oPpi9B9Ispp37iaiMH-5fGW-mBW3YaGsm-dxyIpFURbgG2TOPWED9MzX53yB-sxB21cHGFSumUpyJg                 13.0                 43.0                 20.0              22.0                  19.0         0.384615         0.604651         0.350000      0.409091          0.578947          117          0.495726                       4.880342                          4.585470                          5.373932                          5.425214                           4.820513                   0.288712                      0.229256                      0.318798                      0.272216                       0.274329                 25.085470                       0.493220                       4.705983                          5.695726                          4.864957                          4.200000                           4.482051                   0.260690                      0.233276                      0.296687                      0.257088                       0.284257                 23.948718                       0.480400
4  Hc_TRsKtT6pX-YpfeJtm1ks7iZTCoXyBtS0wqHJXKFBq_8r06xSKp2bYfJI82w25yuiZqqaOvqq2BQ                  1.0                  0.0                106.0               4.0                   1.0         0.000000         0.000000         0.537736      0.500000          1.000000          112          0.535714                       8.116071                         10.366071                          4.877232                          8.029018                           6.406250                   0.257865                      0.287532                      0.354456                      0.312866                       0.287025                 37.794643                       0.508370                       7.244643                         12.580357                          7.758929                          7.667857                           6.546429                   0.247338                      0.280110                      0.348497                      0.288221                       0.330283                 41.798214                       0.508157

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2092 entries, 0 to 2091
Data columns (total 37 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   puuid                              2092 non-null   object 
 1   BOTTOM_games_played                2092 non-null   float64
 2   JUNGLE_games_played                2092 non-null   float64
 3   MIDDLE_games_played                2092 non-null   float64
 4   TOP_games_played                   2092 non-null   float64
 5   UTILITY_games_played               2092 non-null   float64
 6   BOTTOM_win_rate                    2092 non-null   float64
 7   JUNGLE_win_rate                    2092 non-null   float64
 8   MIDDLE_win_rate                    2092 non-null   float64
 9   TOP_win_rate                       2092 non-null   float64
 10  UTILITY_win_rate                   2092 non-null   float64
 11  total_games                        2092 non-null   int64  
 12  overall_win_rate                   2092 non-null   float64
 13  teammate_avg_TOP_games_played      2092 non-null   float64
 14  teammate_avg_JUNGLE_games_played   2092 non-null   float64
 15  teammate_avg_MIDDLE_games_played   2092 non-null   float64
 16  teammate_avg_BOTTOM_games_played   2092 non-null   float64
 17  teammate_avg_UTILITY_games_played  2092 non-null   float64
 18  teammate_avg_TOP_win_rate          2092 non-null   float64
 19  teammate_avg_JUNGLE_win_rate       2092 non-null   float64
 20  teammate_avg_MIDDLE_win_rate       2092 non-null   float64
 21  teammate_avg_BOTTOM_win_rate       2092 non-null   float64
 22  teammate_avg_UTILITY_win_rate      2092 non-null   float64
 23  teammate_avg_total_games           2092 non-null   float64
 24  teammate_avg_overall_win_rate      2092 non-null   float64
 25  opponent_avg_TOP_games_played      2092 non-null   float64
 26  opponent_avg_JUNGLE_games_played   2092 non-null   float64
 27  opponent_avg_MIDDLE_games_played   2092 non-null   float64
 28  opponent_avg_BOTTOM_games_played   2092 non-null   float64
 29  opponent_avg_UTILITY_games_played  2092 non-null   float64
 30  opponent_avg_TOP_win_rate          2092 non-null   float64
 31  opponent_avg_JUNGLE_win_rate       2092 non-null   float64
 32  opponent_avg_MIDDLE_win_rate       2092 non-null   float64
 33  opponent_avg_BOTTOM_win_rate       2092 non-null   float64
 34  opponent_avg_UTILITY_win_rate      2092 non-null   float64
 35  opponent_avg_total_games           2092 non-null   float64
 36  opponent_avg_overall_win_rate      2092 non-null   float64
dtypes: float64(35), int64(1), object(1)
memory usage: 604.8+ KB
None

Statistics saved to 'final_statistics_50.csv'

Number of players in final statistics:
2092

Average number of games played per player:
6.701720841300191

Median number of games played per player:
2.0

Distribution of games played:
count    2092.000000
mean        6.701721
std        12.767424
min         1.000000
25%         1.000000
50%         2.000000
75%         7.000000
max       260.000000
Name: total_games, dtype: float64

Average teammate overall win rate:
0.49598813577436307

Average opponent overall win rate:
0.49792852323710823

Top 10 players by total games played:
                                                                            puuid  total_games
0  PzaOrvw-CzAK4eaNn5d-so7swCCx5pwD9N-sJoVrrFyj3BLWa5abXPF5WxTjA6MkZaM65cwgaPrwtQ          260
1  2614d1Fw7-UV1HE4ALpI4YvHnApKnn1X1GoLYfZ7rgPUNdO8nD9MqAv34uYwqo_Lkx3nrm0jK8W0mg          144
2  H0aRGeflcbSveCqhCNyY0qSDWQXLCXZHY9chqX4fOwfXKLIOXvkTXpeP8VZt3xY2Jhi8_wajcUUjJA          126
3  oPpi9B9Ispp37iaiMH-5fGW-mBW3YaGsm-dxyIpFURbgG2TOPWED9MzX53yB-sxB21cHGFSumUpyJg          117
4  Hc_TRsKtT6pX-YpfeJtm1ks7iZTCoXyBtS0wqHJXKFBq_8r06xSKp2bYfJI82w25yuiZqqaOvqq2BQ          112
5  MOz6RS84aIV9H3WtLyLYhYh7UVfgzbHON6OA4Q2bLn3BxwdKs6zBrWi7hoEWH1rnEJzW1SagMBXujw          104
6  ggGc6V0L2amsOkPIZxkmVugbhVcf0KsnXoD_99URd0gDxvO1cb-JgxDqwogOANdIHaqfgqXf2u28wA           95
7  UZWQvsTnG7F_xH6omzqlnMq1t_ZC8f6oE8mJRMcq0YnLX5vYYsKFyEALIM8SghhcMZh7kL4T455NAg           93
8  jcidv6vpoGmkGdgUYvty-2BRe-8y2eLMfIJwI3EZt73UbLtMQoMnmImvX6_ZxCFJv5IjYNxTTuJ-9Q           81
9  XT2P981ZaIQs2dcieK6agVDyvvqkEXPWTSal33EwUec_HbpN_f53hT7AUy6OWVtMA9C-W0cIs1MxHw           78

Top 10 high-volume players (100+ games) by win rate:
                                                                            puuid  total_games  overall_win_rate
4  Hc_TRsKtT6pX-YpfeJtm1ks7iZTCoXyBtS0wqHJXKFBq_8r06xSKp2bYfJI82w25yuiZqqaOvqq2BQ          112          0.535714
0  PzaOrvw-CzAK4eaNn5d-so7swCCx5pwD9N-sJoVrrFyj3BLWa5abXPF5WxTjA6MkZaM65cwgaPrwtQ          260          0.526923
3  oPpi9B9Ispp37iaiMH-5fGW-mBW3YaGsm-dxyIpFURbgG2TOPWED9MzX53yB-sxB21cHGFSumUpyJg          117          0.495726
5  MOz6RS84aIV9H3WtLyLYhYh7UVfgzbHON6OA4Q2bLn3BxwdKs6zBrWi7hoEWH1rnEJzW1SagMBXujw          104          0.471154
1  2614d1Fw7-UV1HE4ALpI4YvHnApKnn1X1GoLYfZ7rgPUNdO8nD9MqAv34uYwqo_Lkx3nrm0jK8W0mg          144          0.465278
2  H0aRGeflcbSveCqhCNyY0qSDWQXLCXZHY9chqX4fOwfXKLIOXvkTXpeP8VZt3xY2Jhi8_wajcUUjJA          126          0.428571
In [12]:
team_statistics_50
Out[12]:
puuid teammate_avg_TOP_games_played teammate_avg_JUNGLE_games_played teammate_avg_MIDDLE_games_played teammate_avg_BOTTOM_games_played teammate_avg_UTILITY_games_played teammate_avg_TOP_win_rate teammate_avg_JUNGLE_win_rate teammate_avg_MIDDLE_win_rate teammate_avg_BOTTOM_win_rate teammate_avg_UTILITY_win_rate teammate_avg_total_games teammate_avg_overall_win_rate opponent_avg_TOP_games_played opponent_avg_JUNGLE_games_played opponent_avg_MIDDLE_games_played opponent_avg_BOTTOM_games_played opponent_avg_UTILITY_games_played opponent_avg_TOP_win_rate opponent_avg_JUNGLE_win_rate opponent_avg_MIDDLE_win_rate opponent_avg_BOTTOM_win_rate opponent_avg_UTILITY_win_rate opponent_avg_total_games opponent_avg_overall_win_rate
0 -64wlXg5mXgPBTMFWEyYZ2uhYPZKESnUU_2i0kXptpELAhzZC5FRoivFLvuzmIaj6mOIaZE13cu7sw 4.000000 1.375000 4.625000 3.250000 5.000000 0.230655 0.400000 0.236905 0.421212 0.131944 18.250000 0.492063 2.400000 3.500000 3.300000 5.700000 1.600000 0.267857 0.299740 0.379524 0.328594 0.362500 16.500000 0.511259
1 -7_he7CgfDa6u6ZxZVSDlk-T-K1xPP22n92Ai5ikBZ2--PU-at7qAxFLiZN7VUyoFN2iWVsHs_vXxQ 5.375000 14.312500 6.156250 4.500000 5.031250 0.263144 0.191847 0.319355 0.325212 0.365333 35.375000 0.453944 4.425000 6.500000 3.100000 5.025000 4.700000 0.303011 0.261213 0.369788 0.349547 0.224952 23.750000 0.521039
2 -7ukEdWtvkdGVO0O7cOF69PgeyjE2NVEGMjk4SBSQeaNtIsJtPUlIqyTJOAZYo_AjC4pQIWBoR1KIQ 8.152174 15.293478 9.326087 6.663043 6.402174 0.275563 0.228747 0.366942 0.336186 0.369174 45.836957 0.535180 8.260870 17.634783 4.930435 5.321739 6.304348 0.277608 0.315553 0.346748 0.267297 0.331129 42.452174 0.503233
3 -8afnbq9mE-wro2NMhEdqwp6dQp6JizDHIqnA2ybMnNIBqJ5tZY_6OP2ZXg8Rq8n0BuASEzpDePA8Q 3.583333 0.750000 0.750000 1.500000 2.083333 0.273232 0.083333 0.233333 0.193056 0.064394 8.666667 0.473710 0.666667 0.733333 1.866667 1.800000 5.933333 0.122222 0.316667 0.162222 0.167806 0.190598 11.000000 0.351179
4 -C0UiC_P7YCI-licbR0xZCd-N1ba6jpRBUZ4l9lTQX61LzUzXivfCuQ9fu35OxmRKx-pmWt7rHEMwA 5.053571 17.892857 6.821429 4.714286 4.357143 0.212401 0.268637 0.273453 0.285091 0.245658 38.839286 0.452909 4.714286 5.114286 6.042857 5.142857 7.000000 0.280149 0.292019 0.198255 0.286170 0.259857 28.014286 0.543853
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2087 zpMR0IyJXQiG1XJieUB1HudZymMmI7Rs32fa2A1WQfeE7Ob3TebzHjOo06Ja2EvqXs39VHafFM43Pw 9.593750 15.609375 11.468750 6.968750 3.562500 0.294409 0.283392 0.433675 0.285376 0.354664 47.203125 0.520071 8.020833 13.620833 9.045833 8.408333 6.454167 0.260491 0.306952 0.346297 0.287259 0.356227 45.550000 0.511398
2088 zqszyKqXvQ7Pf0wPkfTKV4DMKOuEPXC8Rlup4y4HBZB0OVxa0R11_ReAzjgsJLhrUFSFwk7NgeWjnA 0.000000 0.250000 0.250000 0.500000 0.250000 0.000000 0.000000 0.000000 0.000000 0.000000 1.250000 0.000000 0.400000 0.200000 0.400000 0.200000 0.200000 0.400000 0.200000 0.200000 0.200000 0.200000 1.400000 1.000000
2089 zwI3H0XFbg2LBokpaY9Y2UHDvqxYSiteT6XlcQraMxz14yS7euHOlrNLM60jByE5_1eCxk4tE5uZjQ 7.500000 20.312500 4.031250 6.250000 3.281250 0.243428 0.371074 0.284803 0.319169 0.245164 41.375000 0.513606 5.500000 9.750000 7.675000 5.225000 7.050000 0.293449 0.359259 0.284719 0.268515 0.254720 35.200000 0.491138
2090 zwQ333hBGzXIxG1LjJEba62P1OghUJ5R3cbJVjsMqnjGWjynMZMOLSWaUw-dY7DRnXtkbOOb9N5CfQ 2.166667 4.208333 2.583333 4.791667 5.333333 0.326389 0.179337 0.152913 0.350816 0.198974 19.083333 0.522607 3.700000 4.733333 8.100000 4.133333 6.133333 0.305866 0.218211 0.340349 0.292255 0.276568 26.800000 0.486522
2091 zwrStDRjTTM_vfTBavvuCia-4itxnppD8_RfzxQehVF_nZml44K_7BiLwh6qcD7c6Rzwip2j9rEzpQ 0.250000 0.500000 0.500000 0.500000 0.000000 0.000000 0.125000 0.250000 0.000000 0.000000 1.750000 0.125000 0.200000 0.200000 0.200000 0.200000 0.200000 0.200000 0.200000 0.200000 0.200000 0.200000 1.000000 1.000000

2092 rows × 25 columns

In [14]:
final_statistics_50
Out[14]:
puuid BOTTOM_games_played JUNGLE_games_played MIDDLE_games_played TOP_games_played UTILITY_games_played BOTTOM_win_rate JUNGLE_win_rate MIDDLE_win_rate TOP_win_rate UTILITY_win_rate total_games overall_win_rate teammate_avg_TOP_games_played teammate_avg_JUNGLE_games_played teammate_avg_MIDDLE_games_played teammate_avg_BOTTOM_games_played teammate_avg_UTILITY_games_played teammate_avg_TOP_win_rate teammate_avg_JUNGLE_win_rate teammate_avg_MIDDLE_win_rate teammate_avg_BOTTOM_win_rate teammate_avg_UTILITY_win_rate teammate_avg_total_games teammate_avg_overall_win_rate opponent_avg_TOP_games_played opponent_avg_JUNGLE_games_played opponent_avg_MIDDLE_games_played opponent_avg_BOTTOM_games_played opponent_avg_UTILITY_games_played opponent_avg_TOP_win_rate opponent_avg_JUNGLE_win_rate opponent_avg_MIDDLE_win_rate opponent_avg_BOTTOM_win_rate opponent_avg_UTILITY_win_rate opponent_avg_total_games opponent_avg_overall_win_rate
0 PzaOrvw-CzAK4eaNn5d-so7swCCx5pwD9N-sJoVrrFyj3BLWa5abXPF5WxTjA6MkZaM65cwgaPrwtQ 1.0 235.0 18.0 0.0 6.0 0.000000 0.536170 0.388889 0.000000 0.666667 260 0.526923 7.512500 3.269231 6.180769 7.427885 6.341346 0.282205 0.226809 0.341659 0.323300 0.312858 30.731731 0.509404 7.278462 5.916154 6.633846 6.278462 5.459231 0.261894 0.276024 0.333557 0.307605 0.315497 31.566154 0.507218
1 2614d1Fw7-UV1HE4ALpI4YvHnApKnn1X1GoLYfZ7rgPUNdO8nD9MqAv34uYwqo_Lkx3nrm0jK8W0mg 124.0 1.0 1.0 3.0 15.0 0.483871 0.000000 0.000000 0.333333 0.400000 144 0.465278 8.512153 10.083333 6.989583 3.263889 5.722222 0.291860 0.271317 0.365129 0.249143 0.312629 34.571181 0.510198 6.834722 5.909722 6.270833 5.923611 5.712500 0.256418 0.268402 0.353182 0.302870 0.321966 30.651389 0.504959
2 H0aRGeflcbSveCqhCNyY0qSDWQXLCXZHY9chqX4fOwfXKLIOXvkTXpeP8VZt3xY2Jhi8_wajcUUjJA 2.0 0.0 80.0 6.0 38.0 0.000000 0.000000 0.425000 0.333333 0.473684 126 0.428571 6.617063 9.434524 4.412698 6.210317 5.253968 0.274838 0.290610 0.308358 0.292511 0.286963 31.928571 0.504284 7.136508 8.255556 7.366667 5.938095 5.842857 0.277010 0.285576 0.354699 0.270991 0.330912 34.539683 0.519247
3 oPpi9B9Ispp37iaiMH-5fGW-mBW3YaGsm-dxyIpFURbgG2TOPWED9MzX53yB-sxB21cHGFSumUpyJg 13.0 43.0 20.0 22.0 19.0 0.384615 0.604651 0.350000 0.409091 0.578947 117 0.495726 4.880342 4.585470 5.373932 5.425214 4.820513 0.288712 0.229256 0.318798 0.272216 0.274329 25.085470 0.493220 4.705983 5.695726 4.864957 4.200000 4.482051 0.260690 0.233276 0.296687 0.257088 0.284257 23.948718 0.480400
4 Hc_TRsKtT6pX-YpfeJtm1ks7iZTCoXyBtS0wqHJXKFBq_8r06xSKp2bYfJI82w25yuiZqqaOvqq2BQ 1.0 0.0 106.0 4.0 1.0 0.000000 0.000000 0.537736 0.500000 1.000000 112 0.535714 8.116071 10.366071 4.877232 8.029018 6.406250 0.257865 0.287532 0.354456 0.312866 0.287025 37.794643 0.508370 7.244643 12.580357 7.758929 7.667857 6.546429 0.247338 0.280110 0.348497 0.288221 0.330283 41.798214 0.508157
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2087 SoFo6vs5EuxQ4AJ7YfgR2o7aXiBw9nPIOtD51DsbUfUt9YHD9cQDLQL6DT-0KFVB9CV5Jkotqy2bSw 0.0 0.0 1.0 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 1 0.000000 0.250000 0.250000 0.000000 0.250000 0.250000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.400000 0.400000 0.400000 0.200000 0.600000 0.400000 0.200000 0.400000 0.200000 0.400000 2.000000 1.000000
2088 SoAa5xeK65tEmRXmtwic6PAIOnEJUT94ALj0HpAq5aMd9fHIEcqtefnxv-AqSgfWwC40ixqjp1Xg6w 1.0 0.0 0.0 0.0 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 1 0.000000 0.250000 0.250000 0.250000 0.000000 0.250000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.600000 0.800000 1.000000 0.600000 0.600000 0.200000 0.133333 0.266667 0.300000 0.200000 3.600000 0.533333
2089 SlA0J3w7dXkeAHbgO5d-Df-W6-jobFeLMMx0ffV07CvBD3N6mnEahE8yZ9G_TzBj7PJcnVj63v1qWQ 0.0 0.0 0.0 0.0 1.0 0.000000 0.000000 0.000000 0.000000 0.000000 1 0.000000 0.250000 0.250000 0.250000 0.250000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.600000 0.400000 0.400000 0.400000 0.400000 0.600000 0.200000 0.200000 0.400000 0.200000 2.200000 1.000000
2090 SgR4KjLgzwh77SFQh0QDry9C1amoeyQddfvwUBuYtwiJCNbiW_ZOKa1WJuxydvQq4xuHZtrBccd4iQ 0.0 0.0 0.0 0.0 1.0 0.000000 0.000000 0.000000 0.000000 1.000000 1 1.000000 1.000000 0.750000 1.000000 0.750000 0.000000 0.187500 0.375000 0.416667 0.166667 0.000000 3.500000 0.708333 0.400000 0.400000 0.400000 0.400000 0.200000 0.100000 0.100000 0.100000 0.100000 0.000000 1.800000 0.400000
2091 zwrStDRjTTM_vfTBavvuCia-4itxnppD8_RfzxQehVF_nZml44K_7BiLwh6qcD7c6Rzwip2j9rEzpQ 0.0 0.0 0.0 0.0 1.0 0.000000 0.000000 0.000000 0.000000 0.000000 1 0.000000 0.250000 0.500000 0.500000 0.500000 0.000000 0.000000 0.125000 0.250000 0.000000 0.000000 1.750000 0.125000 0.200000 0.200000 0.200000 0.200000 0.200000 0.200000 0.200000 0.200000 0.200000 0.200000 1.000000 1.000000

2092 rows × 37 columns

In [ ]:
 
In [17]:
# Import required libraries
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np

# Prepare the data
feature_cols = [
    'TOP_games_played', 
    'JUNGLE_games_played', 
    'MIDDLE_games_played', 
    'BOTTOM_games_played', 
    'UTILITY_games_played'
]

X = final_statistics_50[feature_cols]
y = final_statistics_50['overall_win_rate']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Create and train model
model = MLPRegressor(
    hidden_layer_sizes=(10, 8, 5),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='adaptive',
    max_iter=1000,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10
)

# Train model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
cv_scores = cross_val_score(model, X_scaled, y, cv=5, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores.mean())

# Print results
print("\nModel Performance:")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test R²: {r2:.4f}")
print(f"Cross-validation RMSE: {cv_rmse:.4f}")

# Plot results
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Learning curve
ax1.plot(model.loss_curve_)
ax1.set_title('Learning Curve')
ax1.set_xlabel('Iteration')
ax1.set_ylabel('Loss')
ax1.grid(True)

# Predicted vs Actual
ax2.scatter(y_test, y_pred, alpha=0.5)
ax2.plot([min(y_test), max(y_test)], 
         [min(y_test), max(y_test)], 
         'r--', lw=2)
ax2.set_title('Predicted vs Actual Win Rates')
ax2.set_xlabel('Actual Win Rate')
ax2.set_ylabel('Predicted Win Rate')
ax2.grid(True)

plt.tight_layout()
plt.show()

# Example prediction for a new player
new_player = [[100, 50, 20, 10, 5]]  # [TOP, JUNGLE, MIDDLE, BOTTOM, UTILITY]
new_player_scaled = scaler.transform(new_player)
predicted_wr = model.predict(new_player_scaled)[0]
print(f"\nPredicted win rate for new player: {predicted_wr:.2f}%")
Model Performance:
Test RMSE: 0.3697
Test R²: -0.0118
Cross-validation RMSE: 0.4231
No description has been provided for this image
Predicted win rate for new player: -0.03%
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names
  warnings.warn(
In [ ]:
 

Initial SVC Model¶

In [18]:
import numpy as np
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Prepare the features and target
individual_features = ['TOP_games_played', 'JUNGLE_games_played', 'MIDDLE_games_played', 'BOTTOM_games_played', 'UTILITY_games_played',
                       'TOP_win_rate', 'JUNGLE_win_rate', 'MIDDLE_win_rate', 'BOTTOM_win_rate', 'UTILITY_win_rate']

team_features = [col for col in final_statistics_50.columns if col.startswith('teammate_avg_') or col.startswith('opponent_avg_')]

features = individual_features + team_features

X = final_statistics_50[features]
y = (final_statistics_50['overall_win_rate'] > 0.5).astype(int)  # 1 if win rate > 50%, 0 otherwise

# Step 2: Split the data into training, validation, and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Step 3: Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Step 4: Train the SVC model
svc = SVC(kernel='rbf', random_state=42)
svc.fit(X_train_scaled, y_train)

# Step 5: Evaluate the model on validation set
y_val_pred = svc.predict(X_val_scaled)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Step 6: Evaluate the model on test set
y_test_pred = svc.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Step 7: Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

# Step 8: Plot confusion matrix
cm = confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Step 9: Learning Curve
train_sizes, train_scores, val_scores = learning_curve(
    svc, X_train_scaled, y_train, train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5, scoring='accuracy', n_jobs=-1)

plt.figure(figsize=(10, 6))
plt.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', label='Training score')
plt.plot(train_sizes, np.mean(val_scores, axis=1), 'o-', label='Cross-validation score')
plt.xlabel('Training examples')
plt.ylabel('Score')
plt.title('Learning Curve for SVC')
plt.legend(loc='best')
plt.show()

# Step 10: ROC Curve
y_score = svc.decision_function(X_test_scaled)
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()
Validation Accuracy: 0.8520
Test Accuracy: 0.8496

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.89      0.87       246
           1       0.84      0.79      0.81       173

    accuracy                           0.85       419
   macro avg       0.85      0.84      0.84       419
weighted avg       0.85      0.85      0.85       419

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

GridSearchCV Model¶

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import numpy as np

# Assuming X_train_scaled, X_test_scaled, y_train, y_test are already defined

# Set up parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear']
}

# Perform Grid Search
grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Train model with best parameters
best_svc = grid_search.best_estimator_
best_svc.fit(X_train_scaled, y_train)

# Evaluate model
y_pred = best_svc.predict(X_test_scaled)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Plot new learning curve
train_sizes, train_scores, test_scores = learning_curve(
    best_svc, X_train_scaled, y_train, cv=5, n_jobs=-1, 
    train_sizes=np.linspace(0.1, 1.0, 10))

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='Validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
plt.title('Learning Curve')
plt.xlabel('Training Examples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.show()

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Get prediction probabilities
y_pred_proba = best_svc.decision_function(X_test_scaled)

# Calculate false positive rate, true positive rate, and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

# Calculate Area Under the Curve (AUC)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()
Best parameters: {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
Best cross-validation score: 0.8899888446215141

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89       246
           1       0.84      0.87      0.85       173

    accuracy                           0.88       419
   macro avg       0.87      0.87      0.87       419
weighted avg       0.88      0.88      0.88       419

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [20]:
print("Confusion Matrix:")
print(cm)
Confusion Matrix:
[[217  29]
 [ 23 150]]
In [21]:
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()
No description has been provided for this image

Optimising GridSearch CV Model¶

In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming X and y are already defined 

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(f_classif)),
    ('svc', SVC())
])

# Define parameter grid
param_grid = {
    'feature_selection__k': [10, 20, 30],  # Number of features to select
    'svc__C': [0.1, 1, 10],  # Reduced from previous range
    'svc__gamma': [0.01, 0.1, 1],
    'svc__kernel': ['rbf', 'linear']
}

# Perform Grid Search with more cross-validation folds
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate model
y_pred = best_model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Plot learning curve
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = learning_curve(
    best_model, X_train, y_train, cv=10, n_jobs=-1, 
    train_sizes=np.linspace(0.1, 1.0, 10), scoring='accuracy')

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='Validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
plt.title('Learning Curve')
plt.xlabel('Training Examples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.show()

# Plot ROC curve
from sklearn.metrics import roc_curve, auc

y_score = best_model.decision_function(X_test)
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()
Best parameters: {'feature_selection__k': 20, 'svc__C': 10, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}
Best cross-validation score: 0.8732641859138865

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.90      0.89       246
           1       0.85      0.82      0.84       173

    accuracy                           0.87       419
   macro avg       0.86      0.86      0.86       419
weighted avg       0.87      0.87      0.87       419

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Pipeline of SVC Models¶

In [23]:
from scipy import stats
from sklearn.model_selection import cross_val_score

def get_model_stats(model, X, y, cv=10):
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    mean_accuracy = np.mean(scores)
    std_dev = np.std(scores)
    std_error = std_dev / np.sqrt(len(scores))
    ci = stats.t.interval(confidence=0.95, df=len(scores)-1, loc=mean_accuracy, scale=std_error)
    return {
        'mean_accuracy': mean_accuracy,
        'std_dev': std_dev,
        'std_error': std_error,
        'ci': ci
    }

# Assuming X and y are still from the original code

# For the first code (Basic SVM)
basic_svm_stats = get_model_stats(svc, X, y)

print("\nBasic SVM:")
print(f"Mean Accuracy ± 95% CI: {basic_svm_stats['mean_accuracy']:.4f} ± {(basic_svm_stats['ci'][1] - basic_svm_stats['mean_accuracy']):.4f}")
print(f"Std. Dev.: {basic_svm_stats['std_dev']:.4f}")
print(f"Std. Error: {basic_svm_stats['std_error']:.4f}")

# For the second code (GridSearch SVM)
grid_svm_stats = get_model_stats(best_svc, X, y)

print("\nGridSearch SVM:")
print(f"Mean Accuracy ± 95% CI: {grid_svm_stats['mean_accuracy']:.4f} ± {(grid_svm_stats['ci'][1] - grid_svm_stats['mean_accuracy']):.4f}")
print(f"Std. Dev.: {grid_svm_stats['std_dev']:.4f}")
print(f"Std. Error: {grid_svm_stats['std_error']:.4f}")

# For the third code (Pipeline SVM)
pipeline_svm_stats = get_model_stats(best_model, X, y)

print("\nPipeline SVM:")
print(f"Mean Accuracy ± 95% CI: {pipeline_svm_stats['mean_accuracy']:.4f} ± {(pipeline_svm_stats['ci'][1] - pipeline_svm_stats['mean_accuracy']):.4f}")
print(f"Std. Dev.: {pipeline_svm_stats['std_dev']:.4f}")
print(f"Std. Error: {pipeline_svm_stats['std_error']:.4f}")
Basic SVM:
Mean Accuracy ± 95% CI: 0.6502 ± 0.0489
Std. Dev.: 0.0684
Std. Error: 0.0216

GridSearch SVM:
Mean Accuracy ± 95% CI: 0.7512 ± 0.1323
Std. Dev.: 0.1850
Std. Error: 0.0585

Pipeline SVM:
Mean Accuracy ± 95% CI: 0.8587 ± 0.0921
Std. Dev.: 0.1288
Std. Error: 0.0407

KNN Models¶

In [24]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data 
df = final_statistics_50

# Define features and target
individual_features = ['TOP_games_played', 'JUNGLE_games_played', 'MIDDLE_games_played', 'BOTTOM_games_played', 'UTILITY_games_played',
                       'TOP_win_rate', 'JUNGLE_win_rate', 'MIDDLE_win_rate', 'BOTTOM_win_rate', 'UTILITY_win_rate']

team_features = [col for col in df.columns if col.startswith('teammate_avg_') or col.startswith('opponent_avg_')]

features = individual_features + team_features

X = df[features]
y = (df['overall_win_rate'] > 0.5).astype(int)  # 1 if win rate > 50%, 0 otherwise

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

# Define parameter grid
param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9, 11],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}

# Perform Grid Search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate model
y_pred = best_model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Plot learning curve
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = learning_curve(
    best_model, X_train, y_train, cv=5, n_jobs=-1, 
    train_sizes=np.linspace(0.1, 1.0, 10), scoring='accuracy')

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='Validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
plt.title('Learning Curve')
plt.xlabel('Training Examples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.show()

# Plot ROC curve
y_scores = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()
Best parameters: {'knn__metric': 'manhattan', 'knn__n_neighbors': 9, 'knn__weights': 'uniform'}
Best cross-validation score: 0.8428170524622397

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.89      0.86       246
           1       0.83      0.72      0.77       173

    accuracy                           0.82       419
   macro avg       0.82      0.81      0.81       419
weighted avg       0.82      0.82      0.82       419

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [20]:
from scipy import stats
from sklearn.model_selection import cross_val_score

def get_model_stats(model, X, y, cv=10):
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    mean_accuracy = np.mean(scores)
    std_dev = np.std(scores)
    std_error = std_dev / np.sqrt(len(scores))
    ci = stats.t.interval(confidence=0.95, df=len(scores)-1, loc=mean_accuracy, scale=std_error)
    return {
        'mean_accuracy': mean_accuracy,
        'std_dev': std_dev,
        'std_error': std_error,
        'ci': ci
    }

#best_model defined:
knn_stats = get_model_stats(best_model, X, y)

print("\nKNN Model Statistics:")
print(f"Mean Accuracy ± 95% CI: {knn_stats['mean_accuracy']:.4f} ± {(knn_stats['ci'][1] - knn_stats['mean_accuracy']):.4f}")
print(f"Std. Dev.: {knn_stats['std_dev']:.4f}")
print(f"Std. Error: {knn_stats['std_error']:.4f}")
KNN Model Statistics:
Mean Accuracy ± 95% CI: 0.8305 ± 0.1000
Std. Dev.: 0.1397
Std. Error: 0.0442
In [ ]:
 
In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
df = final_statistics_50

# Define features and target
individual_features = ['TOP_games_played', 'JUNGLE_games_played', 'MIDDLE_games_played', 'BOTTOM_games_played', 'UTILITY_games_played',
                       'TOP_win_rate', 'JUNGLE_win_rate', 'MIDDLE_win_rate', 'BOTTOM_win_rate', 'UTILITY_win_rate']
team_features = [col for col in df.columns if col.startswith('teammate_avg_') or col.startswith('opponent_avg_')]
features = individual_features + team_features
X = df[features]
y = (df['overall_win_rate'] > 0.5).astype(int)

# Split the data using stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create a pipeline with feature selection
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(f_classif)),
    ('knn', KNeighborsClassifier())
])

# Define an expanded parameter grid
param_grid = {
    'feature_selection__k': [10, 20, 30, 'all'],
    'knn__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan', 'minkowski'],
    'knn__p': [1, 2, 3]  # Only used when metric='minkowski'
}

# Perform Grid Search with stratified k-fold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate model
y_pred = best_model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Calculate and print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Plot learning curve
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = learning_curve(
    best_model, X_train, y_train, cv=5, n_jobs=-1, 
    train_sizes=np.linspace(0.1, 1.0, 10), scoring='accuracy')

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='Validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
plt.title('Learning Curve')
plt.xlabel('Training Examples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.show()

# Plot ROC curve
y_scores = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Calculate model statistics
from scipy import stats

def get_model_stats(model, X, y, cv=10):
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    mean_accuracy = np.mean(scores)
    std_dev = np.std(scores)
    std_error = std_dev / np.sqrt(len(scores))
    ci = stats.t.interval(confidence=0.95, df=len(scores)-1, loc=mean_accuracy, scale=std_error)
    return {
        'mean_accuracy': mean_accuracy,
        'std_dev': std_dev,
        'std_error': std_error,
        'ci': ci
    }

knn_stats = get_model_stats(best_model, X, y)

print("\nKNN Model Statistics:")
print(f"Mean Accuracy ± 95% CI: {knn_stats['mean_accuracy']:.4f} ± {(knn_stats['ci'][1] - knn_stats['mean_accuracy']):.4f}")
print(f"Std. Dev.: {knn_stats['std_dev']:.4f}")
print(f"Std. Error: {knn_stats['std_error']:.4f}")

# Print feature importances
feature_selector = best_model.named_steps['feature_selection']
selected_features = X.columns[feature_selector.get_support()]
print("\nSelected Features:")
for feature in selected_features:
    print(feature)
Best parameters: {'feature_selection__k': 20, 'knn__metric': 'manhattan', 'knn__n_neighbors': 13, 'knn__p': 1, 'knn__weights': 'distance'}
Best cross-validation score: 0.848787201715971

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.92      0.87       236
           1       0.88      0.75      0.81       183

    accuracy                           0.85       419
   macro avg       0.85      0.84      0.84       419
weighted avg       0.85      0.85      0.85       419

Accuracy: 0.8473
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
KNN Model Statistics:
Mean Accuracy ± 95% CI: 0.8577 ± 0.0865
Std. Dev.: 0.1209
Std. Error: 0.0382

Selected Features:
MIDDLE_games_played
TOP_win_rate
JUNGLE_win_rate
MIDDLE_win_rate
BOTTOM_win_rate
UTILITY_win_rate
teammate_avg_UTILITY_games_played
teammate_avg_TOP_win_rate
teammate_avg_JUNGLE_win_rate
teammate_avg_MIDDLE_win_rate
teammate_avg_BOTTOM_win_rate
teammate_avg_UTILITY_win_rate
teammate_avg_overall_win_rate
opponent_avg_BOTTOM_games_played
opponent_avg_TOP_win_rate
opponent_avg_JUNGLE_win_rate
opponent_avg_MIDDLE_win_rate
opponent_avg_BOTTOM_win_rate
opponent_avg_UTILITY_win_rate
opponent_avg_overall_win_rate

Random Forest Model¶

In [25]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data 
df = final_statistics_50

# Define features and target
individual_features = ['TOP_games_played', 'JUNGLE_games_played', 'MIDDLE_games_played', 'BOTTOM_games_played', 'UTILITY_games_played',
                       'TOP_win_rate', 'JUNGLE_win_rate', 'MIDDLE_win_rate', 'BOTTOM_win_rate', 'UTILITY_win_rate']

team_features = [col for col in df.columns if col.startswith('teammate_avg_') or col.startswith('opponent_avg_')]

features = individual_features + team_features

X = df[features]
y = (df['overall_win_rate'] > 0.5).astype(int)  # 1 if win rate > 50%, 0 otherwise

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=42))
])

# Define parameter grid
param_grid = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__max_features': ['sqrt', 'log2', None]
}

# Perform Grid Search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate model
y_pred = best_model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Plot learning curve
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = learning_curve(
    best_model, X_train, y_train, cv=5, n_jobs=-1, 
    train_sizes=np.linspace(0.1, 1.0, 10), scoring='accuracy')

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='Validation accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
plt.title('Learning Curve')
plt.xlabel('Training Examples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.show()

# Plot ROC curve
y_scores = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Feature importance
importances = best_model.named_steps['rf'].feature_importances_
feature_imp = pd.DataFrame(sorted(zip(importances, X.columns), reverse=True), columns=['Importance', 'Feature'])

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_imp.head(20))
plt.title('Top 20 Feature Importances')
plt.tight_layout()
plt.show()

print("\nTop 20 features by importance:")
print(feature_imp.head(20))
Best parameters: {'rf__max_depth': 10, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 5, 'rf__n_estimators': 300}
Best cross-validation score: 0.9079524533023505

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.91      0.90       246
           1       0.87      0.83      0.85       173

    accuracy                           0.88       419
   macro avg       0.87      0.87      0.87       419
weighted avg       0.88      0.88      0.88       419

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Top 20 features by importance:
    Importance                            Feature
0     0.195682      teammate_avg_overall_win_rate
1     0.153645      opponent_avg_overall_win_rate
2     0.057326      opponent_avg_UTILITY_win_rate
3     0.042912       opponent_avg_MIDDLE_win_rate
4     0.040354       opponent_avg_JUNGLE_win_rate
5     0.039057          opponent_avg_TOP_win_rate
6     0.037344       teammate_avg_MIDDLE_win_rate
7     0.036702      teammate_avg_UTILITY_win_rate
8     0.034723       opponent_avg_BOTTOM_win_rate
9     0.031792          teammate_avg_TOP_win_rate
10    0.029231       teammate_avg_BOTTOM_win_rate
11    0.023563       teammate_avg_JUNGLE_win_rate
12    0.022938                   UTILITY_win_rate
13    0.021174                       TOP_win_rate
14    0.020919                    JUNGLE_win_rate
15    0.019981                    MIDDLE_win_rate
16    0.014661           opponent_avg_total_games
17    0.014393                    BOTTOM_win_rate
18    0.014256  opponent_avg_UTILITY_games_played
19    0.014072  teammate_avg_UTILITY_games_played

GridSearchCV Random Forest¶

In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Load the data
df = final_statistics_50

# Define features and target
individual_features = ['TOP_games_played', 'JUNGLE_games_played', 'MIDDLE_games_played', 'BOTTOM_games_played', 'UTILITY_games_played',
                       'TOP_win_rate', 'JUNGLE_win_rate', 'MIDDLE_win_rate', 'BOTTOM_win_rate', 'UTILITY_win_rate']

team_features = [col for col in df.columns if col.startswith('teammate_avg_') or col.startswith('opponent_avg_')]

features = individual_features + team_features

# Feature engineering
df['team_win_rate_diff'] = df['teammate_avg_overall_win_rate'] - df['opponent_avg_overall_win_rate']
for role in ['TOP', 'JUNGLE', 'MIDDLE', 'BOTTOM', 'UTILITY']:
    df[f'{role}_win_rate_diff'] = df[f'teammate_avg_{role}_win_rate'] - df[f'opponent_avg_{role}_win_rate']

features += ['team_win_rate_diff', 'TOP_win_rate_diff', 'JUNGLE_win_rate_diff', 'MIDDLE_win_rate_diff', 'BOTTOM_win_rate_diff', 'UTILITY_win_rate_diff']

X = df[features]
y = (df['overall_win_rate'] > 0.5).astype(int)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=42))
])

# Define parameter grid
param_grid = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [10, 20, 30, None],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__max_features': ['sqrt', 'log2'],
    'rf__oob_score': [True]  # Added for OOB score calculation
}

# Perform Grid Search with Stratified K-Fold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate model
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Plot ROC curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Threshold tuning
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)
f1_scores = 2 * (precision * recall) / (precision + recall)
optimal_threshold = thresholds[np.argmax(f1_scores)]

print(f"\nOptimal threshold based on F1 score: {optimal_threshold:.3f}")

y_pred_tuned = (y_prob >= optimal_threshold).astype(int)
print("\nClassification Report with tuned threshold:")
print(classification_report(y_test, y_pred_tuned))

# Feature importance
importances = best_model.named_steps['rf'].feature_importances_
feature_imp = pd.DataFrame(sorted(zip(importances, X.columns), reverse=True), columns=['Importance', 'Feature'])

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_imp.head(20))
plt.title('Top 20 Feature Importances')
plt.tight_layout()
plt.show()

# Function to calculate model statistics
def get_model_stats(model, X, y, cv=10):
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    mean_accuracy = np.mean(scores)
    std_dev = np.std(scores)
    std_error = std_dev / np.sqrt(len(scores))
    ci = stats.t.interval(confidence=0.95, df=len(scores)-1, loc=mean_accuracy, scale=std_error)
    return {
        'mean_accuracy': mean_accuracy,
        'std_dev': std_dev,
        'std_error': std_error,
        'ci': ci
    }

# Calculate and print model statistics
model_stats = get_model_stats(best_model, X, y)

print("\nModel Statistics:")
print(f"Mean Accuracy ± 95% CI: {model_stats['mean_accuracy']:.4f} ± {(model_stats['ci'][1] - model_stats['mean_accuracy']):.4f}")
print(f"Standard Deviation: {model_stats['std_dev']:.4f}")
print(f"Standard Error: {model_stats['std_error']:.4f}")

# Function to plot learning curve
def plot_learning_curve(estimator, X, y, title="Learning Curve", axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    if axes is None:
        _, axes = plt.subplots(1, 1, figsize=(10, 5))

    axes.set_title(title)
    if ylim is not None:
        axes.set_ylim(*ylim)
    axes.set_xlabel("Training examples")
    axes.set_ylabel("Score")

    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes,
                       return_times=True)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Plot learning curve
    axes.grid()
    axes.fill_between(train_sizes, train_scores_mean - train_scores_std,
                      train_scores_mean + train_scores_std, alpha=0.1,
                      color="r")
    axes.fill_between(train_sizes, test_scores_mean - test_scores_std,
                      test_scores_mean + test_scores_std, alpha=0.1,
                      color="g")
    axes.plot(train_sizes, train_scores_mean, 'o-', color="r",
              label="Training score")
    axes.plot(train_sizes, test_scores_mean, 'o-', color="g",
              label="Cross-validation score")
    axes.legend(loc="best")

    return plt

# Plot learning curve
plt.figure(figsize=(10, 6))
plot_learning_curve(best_model, X, y, cv=5)
plt.title("Learning Curve for Random Forest")
plt.show()

# Print OOB score
oob_score = best_model.named_steps['rf'].oob_score_
print(f"\nOut-of-Bag Score: {oob_score:.4f}")

print("\nTop 20 features by importance:")
print(feature_imp.head(20))
Best parameters: {'rf__max_depth': 20, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 100, 'rf__oob_score': True}
Best cross-validation score: 0.9668960609234478

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.89      0.88       246
           1       0.84      0.82      0.83       173

    accuracy                           0.86       419
   macro avg       0.86      0.85      0.85       419
weighted avg       0.86      0.86      0.86       419

No description has been provided for this image
No description has been provided for this image
Optimal threshold based on F1 score: 0.480

Classification Report with tuned threshold:
              precision    recall  f1-score   support

           0       0.89      0.88      0.88       246
           1       0.83      0.84      0.83       173

    accuracy                           0.86       419
   macro avg       0.86      0.86      0.86       419
weighted avg       0.86      0.86      0.86       419

No description has been provided for this image
Model Statistics:
Mean Accuracy ± 95% CI: 0.8911 ± 0.0631
Standard Deviation: 0.0882
Standard Error: 0.0279
<Figure size 1000x600 with 0 Axes>
No description has been provided for this image
Out-of-Bag Score: 0.8942

Top 20 features by importance:
    Importance                            Feature
0     0.158953                 team_win_rate_diff
1     0.142574      teammate_avg_overall_win_rate
2     0.116788      opponent_avg_overall_win_rate
3     0.037056      opponent_avg_UTILITY_win_rate
4     0.034241       opponent_avg_BOTTOM_win_rate
5     0.030514               MIDDLE_win_rate_diff
6     0.024179                  TOP_win_rate_diff
7     0.023824       opponent_avg_MIDDLE_win_rate
8     0.023413      teammate_avg_UTILITY_win_rate
9     0.022495          opponent_avg_TOP_win_rate
10    0.020624       opponent_avg_JUNGLE_win_rate
11    0.020212       teammate_avg_MIDDLE_win_rate
12    0.019304          teammate_avg_TOP_win_rate
13    0.019282              UTILITY_win_rate_diff
14    0.018722               BOTTOM_win_rate_diff
15    0.017876       teammate_avg_BOTTOM_win_rate
16    0.016920               JUNGLE_win_rate_diff
17    0.016723                   UTILITY_win_rate
18    0.016390  teammate_avg_UTILITY_games_played
19    0.016110           teammate_avg_total_games

Addressing the problem of overfitting in the random forest model¶

In [24]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Load the data
df = final_statistics_50

# Define features and target
individual_features = ['TOP_games_played', 'JUNGLE_games_played', 'MIDDLE_games_played', 'BOTTOM_games_played', 'UTILITY_games_played',
                       'TOP_win_rate', 'JUNGLE_win_rate', 'MIDDLE_win_rate', 'BOTTOM_win_rate', 'UTILITY_win_rate']
team_features = [col for col in df.columns if col.startswith('teammate_avg_') or col.startswith('opponent_avg_')]
features = individual_features + team_features

# Feature engineering
df['team_win_rate_diff'] = df['teammate_avg_overall_win_rate'] - df['opponent_avg_overall_win_rate']
for role in ['TOP', 'JUNGLE', 'MIDDLE', 'BOTTOM', 'UTILITY']:
    df[f'{role}_win_rate_diff'] = df[f'teammate_avg_{role}_win_rate'] - df[f'opponent_avg_{role}_win_rate']
features += ['team_win_rate_diff', 'TOP_win_rate_diff', 'JUNGLE_win_rate_diff', 'MIDDLE_win_rate_diff', 'BOTTOM_win_rate_diff', 'UTILITY_win_rate_diff']

X = df[features]
y = (df['overall_win_rate'] > 0.5).astype(int)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create a pipeline
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))),
    ('rf', RandomForestClassifier(random_state=42))
])

# Define parameter distribution
param_dist = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [10, 20, None],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__max_features': ['sqrt', 'log2']
}

# Perform Randomized Search
random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=20, cv=5, scoring='roc_auc', n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

# Print best parameters and score
print("Best parameters:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)

# Get the best model
best_model = random_search.best_estimator_

# Evaluate model
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Plot ROC curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Feature importance
importances = best_model.named_steps['rf'].feature_importances_
feature_imp = pd.DataFrame(sorted(zip(importances, X.columns), reverse=True), columns=['Importance', 'Feature'])

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_imp.head(20))
plt.title('Top 20 Feature Importances')
plt.tight_layout()
plt.show()

# Function to calculate model statistics
def get_model_stats(model, X, y, cv=5):
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    mean_accuracy = np.mean(scores)
    std_dev = np.std(scores)
    std_error = std_dev / np.sqrt(len(scores))
    ci = stats.t.interval(confidence=0.95, df=len(scores)-1, loc=mean_accuracy, scale=std_error)
    return {
        'mean_accuracy': mean_accuracy,
        'std_dev': std_dev,
        'std_error': std_error,
        'ci': ci
    }

# Calculate and print model statistics
model_stats = get_model_stats(best_model, X, y)

print("\nModel Statistics:")
print(f"Mean Accuracy ± 95% CI: {model_stats['mean_accuracy']:.4f} ± {(model_stats['ci'][1] - model_stats['mean_accuracy']):.4f}")
print(f"Standard Deviation: {model_stats['std_dev']:.4f}")
print(f"Standard Error: {model_stats['std_error']:.4f}")

print("\nTop 20 features by importance:")
print(feature_imp.head(20))
Best parameters: {'rf__n_estimators': 100, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 1, 'rf__max_features': 'log2', 'rf__max_depth': 10}
Best cross-validation score: 0.9515358664596704

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.88      0.86       236
           1       0.83      0.78      0.80       183

    accuracy                           0.83       419
   macro avg       0.83      0.83      0.83       419
weighted avg       0.83      0.83      0.83       419

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Model Statistics:
Mean Accuracy ± 95% CI: 0.8362 ± 0.1545
Standard Deviation: 0.1245
Standard Error: 0.0557

Top 20 features by importance:
   Importance               Feature
0    0.304805          TOP_win_rate
1    0.256290      TOP_games_played
2    0.177613  UTILITY_games_played
3    0.076685   BOTTOM_games_played
4    0.071446   JUNGLE_games_played
5    0.060004       JUNGLE_win_rate
6    0.053156   MIDDLE_games_played
In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Load the data
df = final_statistics_50  

# Define features and target
individual_features = ['TOP_games_played', 'JUNGLE_games_played', 'MIDDLE_games_played', 'BOTTOM_games_played', 'UTILITY_games_played',
                       'TOP_win_rate', 'JUNGLE_win_rate', 'MIDDLE_win_rate', 'BOTTOM_win_rate', 'UTILITY_win_rate']
team_features = [col for col in df.columns if col.startswith('teammate_avg_') or col.startswith('opponent_avg_')]
features = individual_features + team_features

# Feature engineering
df['team_win_rate_diff'] = df['teammate_avg_overall_win_rate'] - df['opponent_avg_overall_win_rate']
for role in ['TOP', 'JUNGLE', 'MIDDLE', 'BOTTOM', 'UTILITY']:
    df[f'{role}_win_rate_diff'] = df[f'teammate_avg_{role}_win_rate'] - df[f'opponent_avg_{role}_win_rate']
features += ['team_win_rate_diff', 'TOP_win_rate_diff', 'JUNGLE_win_rate_diff', 'MIDDLE_win_rate_diff', 'BOTTOM_win_rate_diff', 'UTILITY_win_rate_diff']

X = df[features]
y = (df['overall_win_rate'] > 0.5).astype(int)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create a pipeline
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))),
    ('rf', RandomForestClassifier(random_state=42))
])

# Define parameter distribution
param_dist = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [10, 20, None],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__max_features': ['sqrt', 'log2'],
    'rf__oob_score': [True, False]
}

# Perform Randomized Search
random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=20, cv=5, scoring='roc_auc', n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

# Print best parameters and score
print("Best parameters:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)

# Get the best model
best_model = random_search.best_estimator_

# Evaluate model
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Plot ROC curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Feature importance
importances = best_model.named_steps['rf'].feature_importances_
feature_imp = pd.DataFrame(sorted(zip(importances, X.columns), reverse=True), columns=['Importance', 'Feature'])

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_imp.head(20))
plt.title('Top 20 Feature Importances')
plt.tight_layout()
plt.show()

# Function to calculate model statistics
def get_model_stats(model, X, y, cv=5):
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    mean_accuracy = np.mean(scores)
    std_dev = np.std(scores)
    std_error = std_dev / np.sqrt(len(scores))
    ci = stats.t.interval(confidence=0.95, df=len(scores)-1, loc=mean_accuracy, scale=std_error)
    return {
        'mean_accuracy': mean_accuracy,
        'std_dev': std_dev,
        'std_error': std_error,
        'ci': ci
    }

# Calculate and print model statistics
model_stats = get_model_stats(best_model, X, y)

print("\nModel Statistics:")
print(f"Mean Accuracy ± 95% CI: {model_stats['mean_accuracy']:.4f} ± {(model_stats['ci'][1] - model_stats['mean_accuracy']):.4f}")
print(f"Standard Deviation: {model_stats['std_dev']:.4f}")
print(f"Standard Error: {model_stats['std_error']:.4f}")

# Function to plot learning curve
def plot_learning_curve(estimator, X, y, title="Learning Curve", axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    if axes is None:
        _, axes = plt.subplots(1, 1, figsize=(10, 5))

    axes.set_title(title)
    if ylim is not None:
        axes.set_ylim(*ylim)
    axes.set_xlabel("Training examples")
    axes.set_ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Plot learning curve
    axes.grid()
    axes.fill_between(train_sizes, train_scores_mean - train_scores_std,
                      train_scores_mean + train_scores_std, alpha=0.1,
                      color="r")
    axes.fill_between(train_sizes, test_scores_mean - test_scores_std,
                      test_scores_mean + test_scores_std, alpha=0.1,
                      color="g")
    axes.plot(train_sizes, train_scores_mean, 'o-', color="r",
              label="Training score")
    axes.plot(train_sizes, test_scores_mean, 'o-', color="g",
              label="Cross-validation score")
    axes.legend(loc="best")

    return plt

# Plot learning curve
plt.figure(figsize=(10, 6))
plot_learning_curve(best_model, X, y, cv=5)
plt.title("Learning Curve for Random Forest")
plt.show()

print("\nTop 20 features by importance:")
print(feature_imp.head(20))

# If Random Forest has oob_score enabled, print the OOB score
if hasattr(best_model.named_steps['rf'], 'oob_score_'):
    print(f"\nOut-of-Bag Score: {best_model.named_steps['rf'].oob_score_:.4f}")
Best parameters: {'rf__oob_score': False, 'rf__n_estimators': 100, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2, 'rf__max_features': 'sqrt', 'rf__max_depth': 10}
Best cross-validation score: 0.9514875871654404

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       236
           1       0.83      0.78      0.80       183

    accuracy                           0.83       419
   macro avg       0.83      0.82      0.83       419
weighted avg       0.83      0.83      0.83       419

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Model Statistics:
Mean Accuracy ± 95% CI: 0.8405 ± 0.1522
Standard Deviation: 0.1226
Standard Error: 0.0548
<Figure size 1000x600 with 0 Axes>
No description has been provided for this image
Top 20 features by importance:
   Importance               Feature
0    0.294969          TOP_win_rate
1    0.274182      TOP_games_played
2    0.178220  UTILITY_games_played
3    0.076599   BOTTOM_games_played
4    0.068155   JUNGLE_games_played
5    0.057892       JUNGLE_win_rate
6    0.049983   MIDDLE_games_played

Further Improvements¶

In [26]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Load the data
df = final_statistics_50

# Define features and target
individual_features = ['TOP_games_played', 'JUNGLE_games_played', 'MIDDLE_games_played', 'BOTTOM_games_played', 'UTILITY_games_played',
                       'TOP_win_rate', 'JUNGLE_win_rate', 'MIDDLE_win_rate', 'BOTTOM_win_rate', 'UTILITY_win_rate']
team_features = [col for col in df.columns if col.startswith('teammate_avg_') or col.startswith('opponent_avg_')]
features = individual_features + team_features

# Feature engineering
df['team_win_rate_diff'] = df['teammate_avg_overall_win_rate'] - df['opponent_avg_overall_win_rate']
for role in ['TOP', 'JUNGLE', 'MIDDLE', 'BOTTOM', 'UTILITY']:
    df[f'{role}_win_rate_diff'] = df[f'teammate_avg_{role}_win_rate'] - df[f'opponent_avg_{role}_win_rate']
features += ['team_win_rate_diff', 'TOP_win_rate_diff', 'JUNGLE_win_rate_diff', 'MIDDLE_win_rate_diff', 'BOTTOM_win_rate_diff', 'UTILITY_win_rate_diff']

X = df[features]
y = (df['overall_win_rate'] > 0.5).astype(int)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create pipeline with simplified feature selection
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))),
    ('rf', RandomForestClassifier(random_state=42, class_weight='balanced'))
])

# Define focused parameter distribution
param_dist = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [5, 10, 15],
    'rf__min_samples_split': [5, 10],
    'rf__min_samples_leaf': [4, 8],
    'rf__max_features': ['sqrt', 'log2']
}

# Perform Randomized Search with fewer iterations
random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, 
                                 n_iter=20, cv=5, scoring='roc_auc', 
                                 n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

# Get the best model
best_model = random_search.best_estimator_

# Print best parameters and score
print("Best parameters:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)

# Evaluate model
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Function to plot learning curve
def plot_learning_curve(estimator, X, y, title="Learning Curve", axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    if axes is None:
        _, axes = plt.subplots(1, 1, figsize=(10, 5))

    axes.set_title(title)
    if ylim is not None:
        axes.set_ylim(*ylim)
    axes.set_xlabel("Training examples")
    axes.set_ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    axes.grid()
    axes.fill_between(train_sizes, train_scores_mean - train_scores_std,
                      train_scores_mean + train_scores_std, alpha=0.1,
                      color="r")
    axes.fill_between(train_sizes, test_scores_mean - test_scores_std,
                      test_scores_mean + test_scores_std, alpha=0.1,
                      color="g")
    axes.plot(train_sizes, train_scores_mean, 'o-', color="r",
              label="Training score")
    axes.plot(train_sizes, test_scores_mean, 'o-', color="g",
              label="Cross-validation score")
    axes.legend(loc="best")
    return plt

# Plot learning curve
plt.figure(figsize=(10, 6))
plot_learning_curve(best_model, X, y, cv=5)
plt.title("Learning Curve for Random Forest")
plt.show()

# Calculate model statistics
def get_model_stats(model, X, y, cv=5):
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    mean_accuracy = np.mean(scores)
    std_dev = np.std(scores)
    std_error = std_dev / np.sqrt(len(scores))
    ci = stats.t.interval(confidence=0.95, df=len(scores)-1, loc=mean_accuracy, scale=std_error)
    return {
        'mean_accuracy': mean_accuracy,
        'std_dev': std_dev,
        'std_error': std_error,
        'ci': ci
    }

# Get and print model statistics
model_stats = get_model_stats(best_model, X, y)
print("\nModel Statistics:")
print(f"Mean Accuracy ± 95% CI: {model_stats['mean_accuracy']:.4f} ± {(model_stats['ci'][1] - model_stats['mean_accuracy']):.4f}")
print(f"Standard Deviation: {model_stats['std_dev']:.4f}")
print(f"Standard Error: {model_stats['std_error']:.4f}")

# Print feature importances
importances = best_model.named_steps['rf'].feature_importances_
selected_mask = best_model.named_steps['feature_selection'].get_support()
selected_features = X.columns[selected_mask]
feature_imp = pd.DataFrame({'Feature': selected_features, 
                           'Importance': importances[selected_mask]})
feature_imp = feature_imp.sort_values('Importance', ascending=False)

print("\nSelected Features and Their Importances:")
print(feature_imp)

# Plot feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_imp)
plt.title('Feature Importances (After Selection)')
plt.tight_layout()
plt.show()
Best parameters: {'rf__n_estimators': 200, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 4, 'rf__max_features': 'log2', 'rf__max_depth': 10}
Best cross-validation score: 0.9536274475384673

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.83      0.85       246
           1       0.77      0.84      0.80       173

    accuracy                           0.83       419
   macro avg       0.83      0.83      0.83       419
weighted avg       0.83      0.83      0.83       419

No description has been provided for this image
<Figure size 1000x600 with 0 Axes>
No description has been provided for this image
Model Statistics:
Mean Accuracy ± 95% CI: 0.8443 ± 0.1496
Standard Deviation: 0.1205
Standard Error: 0.0539
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[26], line 146
    143 selected_mask = best_model.named_steps['feature_selection'].get_support()
    144 selected_features = X.columns[selected_mask]
    145 feature_imp = pd.DataFrame({'Feature': selected_features, 
--> 146                            'Importance': importances[selected_mask]})
    147 feature_imp = feature_imp.sort_values('Importance', ascending=False)
    149 print("\nSelected Features and Their Importances:")

IndexError: boolean index did not match indexed array along dimension 0; dimension is 6 but corresponding boolean dimension is 40

Advanced Models¶

Trying to reduce overfitting¶

In [35]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from scipy.stats import t

# Create pipeline
xgb_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('xgb', xgb.XGBClassifier(
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    ))
])

# Define focused parameter grid with strong regularization
xgb_param_grid = {
    'xgb__n_estimators': [100],
    'xgb__max_depth': [2, 3],
    'xgb__learning_rate': [0.01],0
    'xgb__min_child_weight': [5],
    'xgb__subsample': [0.6],
    'xgb__colsample_bytree': [0.6],
    'xgb__gamma': [0.2],
    'xgb__reg_alpha': [1],
    'xgb__reg_lambda': [2]
}

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Perform Grid Search
print("Training model...")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    xgb_pipeline, 
    xgb_param_grid, 
    cv=cv, 
    scoring='roc_auc', 
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# Print results
print("\nBest parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Get best model
best_model = grid_search.best_estimator_

# Evaluate on test set
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot confusion matrix
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Plot ROC curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

# Learning curve
def plot_learning_curve(estimator, X, y, title):
    train_sizes = np.linspace(0.1, 1.0, 10)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y,
        train_sizes=train_sizes,
        cv=5,
        n_jobs=-1,
        scoring='accuracy'
    )
    
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    
    plt.figure(figsize=(10, 6))
    plt.plot(train_sizes, train_mean, 'o-', color='r', label='Training score')
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='r')
    plt.plot(train_sizes, test_mean, 'o-', color='g', label='Cross-validation score')
    plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1, color='g')
    
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    plt.title(title)
    plt.legend(loc='best')
    plt.grid(True)
    plt.show()
    
    print(f"\nFinal scores:")
    print(f"Training Score: {train_mean[-1]:.4f} ± {train_std[-1]:.4f}")
    print(f"CV Score: {test_mean[-1]:.4f} ± {test_std[-1]:.4f}")

# Plot learning curve
plot_learning_curve(best_model, X, y, "XGBoost Learning Curve")

# Calculate model statistics
scores = cross_val_score(best_model, X, y, cv=5, scoring='accuracy')
mean_accuracy = np.mean(scores)
std_dev = np.std(scores)
std_error = std_dev / np.sqrt(len(scores))
ci = t.interval(0.95, len(scores)-1, loc=mean_accuracy, scale=std_error)

print("\nModel Statistics:")
print(f"Mean Accuracy ± 95% CI: {mean_accuracy:.4f} ± {(ci[1] - mean_accuracy):.4f}")
print(f"Standard Deviation: {std_dev:.4f}")
print(f"Standard Error: {std_error:.4f}")

# Feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': best_model.named_steps['xgb'].feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance.head(20))
plt.title('Feature Importance')
plt.tight_layout()
plt.show()
Training model...
Fitting 5 folds for each of 2 candidates, totalling 10 fits
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
Best parameters: {'xgb__colsample_bytree': 0.6, 'xgb__gamma': 0.2, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 3, 'xgb__min_child_weight': 5, 'xgb__n_estimators': 100, 'xgb__reg_alpha': 1, 'xgb__reg_lambda': 2, 'xgb__subsample': 0.6}
Best cross-validation score: 0.9576052280428347

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.89      0.87       236
           1       0.85      0.79      0.82       183

    accuracy                           0.84       419
   macro avg       0.85      0.84      0.84       419
weighted avg       0.84      0.84      0.84       419

No description has been provided for this image
No description has been provided for this image
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:44] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
No description has been provided for this image
Final scores:
Training Score: 0.8938 ± 0.0210
CV Score: 0.8433 ± 0.1235
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
/Users/devinpathiraja/anaconda3/envs/CAB420/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [11:24:45] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
Model Statistics:
Mean Accuracy ± 95% CI: 0.8429 ± 0.1534
Standard Deviation: 0.1236
Standard Error: 0.0553
No description has been provided for this image

Learning Curves¶

In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, learning_curve, StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.base import clone  # Added this import
import matplotlib.pyplot as plt

def plot_learning_curves(estimator, X, y, title):
    """
    Plot both accuracy and log loss learning curves for training and validation sets.
    
    Parameters:
    -----------
    estimator : estimator object
        A fitted estimator object implementing 'fit' and 'predict_proba'
    X : array-like
        Training data
    y : array-like
        Target values
    title : string
        Plot title
    """
    train_sizes = np.linspace(0.1, 1.0, 10)
    
    # Get accuracy scores
    train_sizes_acc, train_scores_acc, test_scores_acc = learning_curve(
        estimator, X, y,
        train_sizes=train_sizes,
        cv=5,
        n_jobs=-1,
        scoring='accuracy'
    )
    
    # Calculate log loss manually
    train_scores_loss = []
    test_scores_loss = []
    
    for train_idx, valid_idx in StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X, y):
        X_train_cv, X_valid_cv = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_cv, y_valid_cv = y.iloc[train_idx], y.iloc[valid_idx]
        
        # Initialize lists for this fold
        train_loss_fold = []
        valid_loss_fold = []
        
        for size in train_sizes:
            # Take a subset of training data
            subset_size = int(len(X_train_cv) * size)
            X_subset = X_train_cv.iloc[:subset_size]
            y_subset = y_train_cv.iloc[:subset_size]
            
            # Fit model on subset
            model_clone = clone(estimator)
            model_clone.fit(X_subset, y_subset)
            
            # Calculate losses
            train_proba = model_clone.predict_proba(X_subset)
            valid_proba = model_clone.predict_proba(X_valid_cv)
            
            train_loss = log_loss(y_subset, train_proba)
            valid_loss = log_loss(y_valid_cv, valid_proba)
            
            train_loss_fold.append(train_loss)
            valid_loss_fold.append(valid_loss)
        
        train_scores_loss.append(train_loss_fold)
        test_scores_loss.append(valid_loss_fold)
    
    train_scores_loss = np.array(train_scores_loss)
    test_scores_loss = np.array(test_scores_loss)
    
    # Calculate statistics
    train_mean_acc = np.mean(train_scores_acc, axis=1)
    train_std_acc = np.std(train_scores_acc, axis=1)
    test_mean_acc = np.mean(test_scores_acc, axis=1)
    test_std_acc = np.std(test_scores_acc, axis=1)
    
    train_mean_loss = np.mean(train_scores_loss, axis=0)
    train_std_loss = np.std(train_scores_loss, axis=0)
    test_mean_loss = np.mean(test_scores_loss, axis=0)
    test_std_loss = np.std(test_scores_loss, axis=0)
    
    # Create subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    
    # Plot accuracy
    ax1.plot(train_sizes, train_mean_acc, 'o-', color='r', label='Training accuracy')
    ax1.fill_between(train_sizes, train_mean_acc - train_std_acc, 
                     train_mean_acc + train_std_acc, alpha=0.1, color='r')
    ax1.plot(train_sizes, test_mean_acc, 'o-', color='g', label='Cross-validation accuracy')
    ax1.fill_between(train_sizes, test_mean_acc - test_std_acc,
                     test_mean_acc + test_std_acc, alpha=0.1, color='g')
    ax1.set_xlabel('Training examples')
    ax1.set_ylabel('Accuracy')
    ax1.set_title('Learning Curve - Accuracy')
    ax1.legend(loc='best')
    ax1.grid(True)
    
    # Plot loss
    ax2.plot(train_sizes, train_mean_loss, 'o-', color='r', label='Training loss')
    ax2.fill_between(train_sizes, train_mean_loss - train_std_loss,
                     train_mean_loss + train_std_loss, alpha=0.1, color='r')
    ax2.plot(train_sizes, test_mean_loss, 'o-', color='g', label='Cross-validation loss')
    ax2.fill_between(train_sizes, test_mean_loss - test_std_loss,
                     test_mean_loss + test_std_loss, alpha=0.1, color='g')
    ax2.set_xlabel('Training examples')
    ax2.set_ylabel('Log Loss')
    ax2.set_title('Learning Curve - Log Loss')
    ax2.legend(loc='best')
    ax2.grid(True)
    
    plt.suptitle(title)
    plt.tight_layout()
    plt.show()
    
    # Print final scores
    print(f"\nFinal scores:")
    print(f"Training Accuracy: {train_mean_acc[-1]:.4f} ± {train_std_acc[-1]:.4f}")
    print(f"CV Accuracy: {test_mean_acc[-1]:.4f} ± {test_std_acc[-1]:.4f}")
    print(f"Training Loss: {train_mean_loss[-1]:.4f} ± {train_std_loss[-1]:.4f}")
    print(f"CV Loss: {test_mean_loss[-1]:.4f} ± {test_std_loss[-1]:.4f}")

# Replace the existing learning curve call with:
plot_learning_curves(best_model, X, y, "XGBoost Learning Curves")
No description has been provided for this image
Final scores:
Training Accuracy: 0.9450 ± 0.0184
CV Accuracy: 0.8443 ± 0.1199
Training Loss: 0.1671 ± 0.0058
CV Loss: 0.2821 ± 0.0188

Checking for features¶

In [30]:
print("Available features:")
print(X.columns.tolist())
print("\nShape of data:")
print(X.shape)

# Also check for any null values
print("\nNull values in dataset:")
print(X.isnull().sum())
Available features:
['TOP_games_played', 'JUNGLE_games_played', 'MIDDLE_games_played', 'BOTTOM_games_played', 'UTILITY_games_played', 'TOP_win_rate', 'JUNGLE_win_rate', 'MIDDLE_win_rate', 'BOTTOM_win_rate', 'UTILITY_win_rate', 'teammate_avg_TOP_games_played', 'teammate_avg_JUNGLE_games_played', 'teammate_avg_MIDDLE_games_played', 'teammate_avg_BOTTOM_games_played', 'teammate_avg_UTILITY_games_played', 'teammate_avg_TOP_win_rate', 'teammate_avg_JUNGLE_win_rate', 'teammate_avg_MIDDLE_win_rate', 'teammate_avg_BOTTOM_win_rate', 'teammate_avg_UTILITY_win_rate', 'teammate_avg_total_games', 'teammate_avg_overall_win_rate', 'opponent_avg_TOP_games_played', 'opponent_avg_JUNGLE_games_played', 'opponent_avg_MIDDLE_games_played', 'opponent_avg_BOTTOM_games_played', 'opponent_avg_UTILITY_games_played', 'opponent_avg_TOP_win_rate', 'opponent_avg_JUNGLE_win_rate', 'opponent_avg_MIDDLE_win_rate', 'opponent_avg_BOTTOM_win_rate', 'opponent_avg_UTILITY_win_rate', 'opponent_avg_total_games', 'opponent_avg_overall_win_rate', 'team_win_rate_diff', 'TOP_win_rate_diff', 'JUNGLE_win_rate_diff', 'MIDDLE_win_rate_diff', 'BOTTOM_win_rate_diff', 'UTILITY_win_rate_diff']

Shape of data:
(2092, 40)

Null values in dataset:
TOP_games_played                     0
JUNGLE_games_played                  0
MIDDLE_games_played                  0
BOTTOM_games_played                  0
UTILITY_games_played                 0
TOP_win_rate                         0
JUNGLE_win_rate                      0
MIDDLE_win_rate                      0
BOTTOM_win_rate                      0
UTILITY_win_rate                     0
teammate_avg_TOP_games_played        0
teammate_avg_JUNGLE_games_played     0
teammate_avg_MIDDLE_games_played     0
teammate_avg_BOTTOM_games_played     0
teammate_avg_UTILITY_games_played    0
teammate_avg_TOP_win_rate            0
teammate_avg_JUNGLE_win_rate         0
teammate_avg_MIDDLE_win_rate         0
teammate_avg_BOTTOM_win_rate         0
teammate_avg_UTILITY_win_rate        0
teammate_avg_total_games             0
teammate_avg_overall_win_rate        0
opponent_avg_TOP_games_played        0
opponent_avg_JUNGLE_games_played     0
opponent_avg_MIDDLE_games_played     0
opponent_avg_BOTTOM_games_played     0
opponent_avg_UTILITY_games_played    0
opponent_avg_TOP_win_rate            0
opponent_avg_JUNGLE_win_rate         0
opponent_avg_MIDDLE_win_rate         0
opponent_avg_BOTTOM_win_rate         0
opponent_avg_UTILITY_win_rate        0
opponent_avg_total_games             0
opponent_avg_overall_win_rate        0
team_win_rate_diff                   0
TOP_win_rate_diff                    0
JUNGLE_win_rate_diff                 0
MIDDLE_win_rate_diff                 0
BOTTOM_win_rate_diff                 0
UTILITY_win_rate_diff                0
dtype: int64
In [ ]:
 
In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

# Create pipeline with modified configuration
xgb_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('xgb', xgb.XGBClassifier(
        random_state=42,
        objective='binary:logistic',
        scale_pos_weight=1,
        enable_categorical=False
    ))
])

# Simplified parameter grid
xgb_param_grid = {
    'xgb__n_estimators': [100, 200],
    'xgb__max_depth': [3, 4],
    'xgb__learning_rate': [0.01, 0.05],
    'xgb__min_child_weight': [5],
    'xgb__subsample': [0.7],
    'xgb__colsample_bytree': [0.7],
    'xgb__gamma': [0.1],
    'xgb__reg_alpha': [1],
    'xgb__reg_lambda': [2]
}

# Split data with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

# Set up cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform Grid Search
grid_search = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgb_param_grid,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2,
    return_train_score=True
)

# Fit the model
print("Starting model training...")
grid_search.fit(X_train, y_train)

# Print results
print("\nBest parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Get best model
best_model = grid_search.best_estimator_

# Evaluate on test set
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot confusion matrix
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Plot ROC curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

# Feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': best_model.named_steps['xgb'].feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance.head(20))
plt.title('Feature Importance')
plt.tight_layout()
plt.show()
Starting model training...
Fitting 5 folds for each of 8 candidates, totalling 40 fits

Best parameters: {'xgb__colsample_bytree': 0.7, 'xgb__gamma': 0.1, 'xgb__learning_rate': 0.05, 'xgb__max_depth': 4, 'xgb__min_child_weight': 5, 'xgb__n_estimators': 200, 'xgb__reg_alpha': 1, 'xgb__reg_lambda': 2, 'xgb__subsample': 0.7}
Best cross-validation score: 0.9754360502580512

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.93      0.92       236
           1       0.91      0.90      0.90       183

    accuracy                           0.91       419
   macro avg       0.91      0.91      0.91       419
weighted avg       0.91      0.91      0.91       419

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

XGBoost¶

In [33]:
# Add this import
from sklearn.metrics import log_loss
from sklearn.base import clone
from sklearn.model_selection import learning_curve

# Add this function before the model training
def plot_learning_curves(estimator, X, y, title):
    """
    Plot both accuracy and log loss learning curves for training and validation sets.
    """
    train_sizes = np.linspace(0.1, 1.0, 10)
    
    # Get accuracy scores
    train_sizes_acc, train_scores_acc, test_scores_acc = learning_curve(
        estimator, X, y,
        train_sizes=train_sizes,
        cv=5,
        n_jobs=-1,
        scoring='accuracy'
    )
    
    # Calculate log loss manually
    train_scores_loss = []
    test_scores_loss = []
    
    for train_idx, valid_idx in StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X, y):
        X_train_cv, X_valid_cv = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_cv, y_valid_cv = y.iloc[train_idx], y.iloc[valid_idx]
        
        train_loss_fold = []
        valid_loss_fold = []
        
        for size in train_sizes:
            subset_size = int(len(X_train_cv) * size)
            X_subset = X_train_cv.iloc[:subset_size]
            y_subset = y_train_cv.iloc[:subset_size]
            
            model_clone = clone(estimator)
            model_clone.fit(X_subset, y_subset)
            
            train_proba = model_clone.predict_proba(X_subset)
            valid_proba = model_clone.predict_proba(X_valid_cv)
            
            train_loss = log_loss(y_subset, train_proba)
            valid_loss = log_loss(y_valid_cv, valid_proba)
            
            train_loss_fold.append(train_loss)
            valid_loss_fold.append(valid_loss)
        
        train_scores_loss.append(train_loss_fold)
        test_scores_loss.append(valid_loss_fold)
    
    train_scores_loss = np.array(train_scores_loss)
    test_scores_loss = np.array(test_scores_loss)
    
    # Calculate statistics
    train_mean_acc = np.mean(train_scores_acc, axis=1)
    train_std_acc = np.std(train_scores_acc, axis=1)
    test_mean_acc = np.mean(test_scores_acc, axis=1)
    test_std_acc = np.std(test_scores_acc, axis=1)
    
    train_mean_loss = np.mean(train_scores_loss, axis=0)
    train_std_loss = np.std(train_scores_loss, axis=0)
    test_mean_loss = np.mean(test_scores_loss, axis=0)
    test_std_loss = np.std(test_scores_loss, axis=0)
    
    # Create subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    
    # Plot accuracy
    ax1.plot(train_sizes, train_mean_acc, 'o-', color='r', label='Training accuracy')
    ax1.fill_between(train_sizes, train_mean_acc - train_std_acc, 
                     train_mean_acc + train_std_acc, alpha=0.1, color='r')
    ax1.plot(train_sizes, test_mean_acc, 'o-', color='g', label='Cross-validation accuracy')
    ax1.fill_between(train_sizes, test_mean_acc - test_std_acc,
                     test_mean_acc + test_std_acc, alpha=0.1, color='g')
    ax1.set_xlabel('Training examples')
    ax1.set_ylabel('Accuracy')
    ax1.set_title('Learning Curve - Accuracy')
    ax1.legend(loc='best')
    ax1.grid(True)
    
    # Plot loss
    ax2.plot(train_sizes, train_mean_loss, 'o-', color='r', label='Training loss')
    ax2.fill_between(train_sizes, train_mean_loss - train_std_loss,
                     train_mean_loss + train_std_loss, alpha=0.1, color='r')
    ax2.plot(train_sizes, test_mean_loss, 'o-', color='g', label='Cross-validation loss')
    ax2.fill_between(train_sizes, test_mean_loss - test_std_loss,
                     test_mean_loss + test_std_loss, alpha=0.1, color='g')
    ax2.set_xlabel('Training examples')
    ax2.set_ylabel('Log Loss')
    ax2.set_title('Learning Curve - Log Loss')
    ax2.legend(loc='best')
    ax2.grid(True)
    
    plt.suptitle(title)
    plt.tight_layout()
    plt.show()
    
    # Print final scores
    print(f"\nFinal scores:")
    print(f"Training Accuracy: {train_mean_acc[-1]:.4f} ± {train_std_acc[-1]:.4f}")
    print(f"CV Accuracy: {test_mean_acc[-1]:.4f} ± {test_std_acc[-1]:.4f}")
    print(f"Training Loss: {train_mean_loss[-1]:.4f} ± {train_std_loss[-1]:.4f}")
    print(f"CV Loss: {test_mean_loss[-1]:.4f} ± {test_std_loss[-1]:.4f}")

# Add this after getting the best model
print("\nGenerating learning curves...")
plot_learning_curves(best_model, X, y, "XGBoost Learning Curves")
Generating learning curves...
No description has been provided for this image
Final scores:
Training Accuracy: 0.9835 ± 0.0029
CV Accuracy: 0.8921 ± 0.0858
Training Loss: 0.1026 ± 0.0027
CV Loss: 0.1892 ± 0.0176
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s

XGBoost Learning Curves¶

In [35]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, log_loss
from sklearn.base import clone
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

def plot_learning_curves(estimator, X, y, title):
    """
    Plot both accuracy and log loss learning curves for training and validation sets.
    Adapted to work with sklearn Pipeline.
    """
    train_sizes = np.linspace(0.1, 1.0, 10)
    
    # Get accuracy scores
    train_sizes_acc, train_scores_acc, test_scores_acc = learning_curve(
        estimator, X, y,
        train_sizes=train_sizes,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        n_jobs=-1,
        scoring='accuracy'
    )
    
    # Calculate log loss manually
    train_scores_loss = []
    test_scores_loss = []
    
    for train_idx, valid_idx in StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X, y):
        X_train_cv, X_valid_cv = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_cv, y_valid_cv = y.iloc[train_idx], y.iloc[valid_idx]
        
        train_loss_fold = []
        valid_loss_fold = []
        
        for size in train_sizes:
            subset_size = int(len(X_train_cv) * size)
            X_subset = X_train_cv.iloc[:subset_size]
            y_subset = y_train_cv.iloc[:subset_size]
            
            model_clone = clone(estimator)
            model_clone.fit(X_subset, y_subset)
            
            train_proba = model_clone.predict_proba(X_subset)
            valid_proba = model_clone.predict_proba(X_valid_cv)
            
            train_loss = log_loss(y_subset, train_proba)
            valid_loss = log_loss(y_valid_cv, valid_proba)
            
            train_loss_fold.append(train_loss)
            valid_loss_fold.append(valid_loss)
        
        train_scores_loss.append(train_loss_fold)
        test_scores_loss.append(valid_loss_fold)
    
    train_scores_loss = np.array(train_scores_loss)
    test_scores_loss = np.array(test_scores_loss)
    
    # Calculate statistics
    train_mean_acc = np.mean(train_scores_acc, axis=1)
    train_std_acc = np.std(train_scores_acc, axis=1)
    test_mean_acc = np.mean(test_scores_acc, axis=1)
    test_std_acc = np.std(test_scores_acc, axis=1)
    
    train_mean_loss = np.mean(train_scores_loss, axis=0)
    train_std_loss = np.std(train_scores_loss, axis=0)
    test_mean_loss = np.mean(test_scores_loss, axis=0)
    test_std_loss = np.std(test_scores_loss, axis=0)
    
    # Create subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    fig.suptitle(title, fontsize=14, y=1.05)
    
    # Plot accuracy
    ax1.plot(train_sizes, train_mean_acc, 'o-', color='#2ecc71', 
            label='Training accuracy', linewidth=2, markersize=6)
    ax1.fill_between(train_sizes, train_mean_acc - train_std_acc,
                    train_mean_acc + train_std_acc, alpha=0.15, color='#2ecc71')
    ax1.plot(train_sizes, test_mean_acc, 'o-', color='#e74c3c',
            label='Cross-validation accuracy', linewidth=2, markersize=6)
    ax1.fill_between(train_sizes, test_mean_acc - test_std_acc,
                    test_mean_acc + test_std_acc, alpha=0.15, color='#e74c3c')
    ax1.set_xlabel('Training examples', fontsize=10)
    ax1.set_ylabel('Accuracy', fontsize=10)
    ax1.set_title('Learning Curve - Accuracy', fontsize=12)
    ax1.legend(loc='lower right', fontsize=9)
    ax1.grid(True, alpha=0.3)
    
    # Plot loss
    ax2.plot(train_sizes, train_mean_loss, 'o-', color='#2ecc71',
            label='Training loss', linewidth=2, markersize=6)
    ax2.fill_between(train_sizes, train_mean_loss - train_std_loss,
                    train_mean_loss + train_std_loss, alpha=0.15, color='#2ecc71')
    ax2.plot(train_sizes, test_mean_loss, 'o-', color='#e74c3c',
            label='Cross-validation loss', linewidth=2, markersize=6)
    ax2.fill_between(train_sizes, test_mean_loss - test_std_loss,
                    test_mean_loss + test_std_loss, alpha=0.15, color='#e74c3c')
    ax2.set_xlabel('Training examples', fontsize=10)
    ax2.set_ylabel('Log Loss', fontsize=10)
    ax2.set_title('Learning Curve - Log Loss', fontsize=12)
    ax2.legend(loc='upper right', fontsize=9)
    ax2.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Print final scores with improved formatting
    print("\nFinal Scores:")
    print("-" * 50)
    print(f"Training Accuracy:     {train_mean_acc[-1]:.4f} ± {train_std_acc[-1]:.4f}")
    print(f"CV Accuracy:          {test_mean_acc[-1]:.4f} ± {test_std_acc[-1]:.4f}")
    print(f"Training Loss:        {train_mean_loss[-1]:.4f} ± {train_std_loss[-1]:.4f}")
    print(f"CV Loss:             {test_mean_loss[-1]:.4f} ± {test_std_loss[-1]:.4f}")
    print(f"Overfitting Gap:      {train_mean_acc[-1] - test_mean_acc[-1]:.4f}")
    
    # Calculate log loss manually
    train_scores_loss = []
    test_scores_loss = []
    
    for train_idx, valid_idx in StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X, y):
        X_train_cv, X_valid_cv = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_cv, y_valid_cv = y.iloc[train_idx], y.iloc[valid_idx]
        
        train_loss_fold = []
        valid_loss_fold = []
        
        for size in train_sizes:
            subset_size = int(len(X_train_cv) * size)
            X_subset = X_train_cv.iloc[:subset_size]
            y_subset = y_train_cv.iloc[:subset_size]
            
            model_clone = clone(estimator)
            model_clone.fit(X_subset, y_subset)
            
            train_proba = model_clone.predict_proba(X_subset)
            valid_proba = model_clone.predict_proba(X_valid_cv)
            
            train_loss = log_loss(y_subset, train_proba)
            valid_loss = log_loss(y_valid_cv, valid_proba)
            
            train_loss_fold.append(train_loss)
            valid_loss_fold.append(valid_loss)
        
        train_scores_loss.append(train_loss_fold)
        test_scores_loss.append(valid_loss_fold)
    
    train_scores_loss = np.array(train_scores_loss)
    test_scores_loss = np.array(test_scores_loss)
    
    # Calculate statistics
    train_mean_acc = np.mean(train_scores_acc, axis=1)
    train_std_acc = np.std(train_scores_acc, axis=1)
    test_mean_acc = np.mean(test_scores_acc, axis=1)
    test_std_acc = np.std(test_scores_acc, axis=1)
    
    train_mean_loss = np.mean(train_scores_loss, axis=0)
    train_std_loss = np.std(train_scores_loss, axis=0)
    test_mean_loss = np.mean(test_scores_loss, axis=0)
    test_std_loss = np.std(test_scores_loss, axis=0)
    
    # Create subplots with improved styling
    plt.style.use('seaborn')
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    fig.suptitle(title, fontsize=14, y=1.05)
    
    # Plot accuracy
    ax1.plot(train_sizes, train_mean_acc, 'o-', color='#2ecc71', 
            label='Training accuracy', linewidth=2, markersize=6)
    ax1.fill_between(train_sizes, train_mean_acc - train_std_acc,
                    train_mean_acc + train_std_acc, alpha=0.15, color='#2ecc71')
    ax1.plot(train_sizes, test_mean_acc, 'o-', color='#e74c3c',
            label='Cross-validation accuracy', linewidth=2, markersize=6)
    ax1.fill_between(train_sizes, test_mean_acc - test_std_acc,
                    test_mean_acc + test_std_acc, alpha=0.15, color='#e74c3c')
    ax1.set_xlabel('Training examples', fontsize=10)
    ax1.set_ylabel('Accuracy', fontsize=10)
    ax1.set_title('Learning Curve - Accuracy', fontsize=12)
    ax1.legend(loc='lower right', fontsize=9)
    ax1.grid(True, linestyle='--', alpha=0.7)
    
    # Plot loss
    ax2.plot(train_sizes, train_mean_loss, 'o-', color='#2ecc71',
            label='Training loss', linewidth=2, markersize=6)
    ax2.fill_between(train_sizes, train_mean_loss - train_std_loss,
                    train_mean_loss + train_std_loss, alpha=0.15, color='#2ecc71')
    ax2.plot(train_sizes, test_mean_loss, 'o-', color='#e74c3c',
            label='Cross-validation loss', linewidth=2, markersize=6)
    ax2.fill_between(train_sizes, test_mean_loss - test_std_loss,
                    test_mean_loss + test_std_loss, alpha=0.15, color='#e74c3c')
    ax2.set_xlabel('Training examples', fontsize=10)
    ax2.set_ylabel('Log Loss', fontsize=10)
    ax2.set_title('Learning Curve - Log Loss', fontsize=12)
    ax2.legend(loc='upper right', fontsize=9)
    ax2.grid(True, linestyle='--', alpha=0.7)
    
    plt.tight_layout()
    plt.show()
    
    # Print final scores with improved formatting
    print("\nFinal Scores:")
    print("-" * 50)
    print(f"Training Accuracy:     {train_mean_acc[-1]:.4f} ± {train_std_acc[-1]:.4f}")
    print(f"CV Accuracy:          {test_mean_acc[-1]:.4f} ± {test_std_acc[-1]:.4f}")
    print(f"Training Loss:        {train_mean_loss[-1]:.4f} ± {train_std_loss[-1]:.4f}")
    print(f"CV Loss:             {test_mean_loss[-1]:.4f} ± {test_std_loss[-1]:.4f}")
    print(f"Overfitting Gap:      {train_mean_acc[-1] - test_mean_acc[-1]:.4f}")

# Create pipeline with modified configuration
xgb_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('xgb', xgb.XGBClassifier(
        random_state=42,
        objective='binary:logistic',
        scale_pos_weight=1,
        enable_categorical=False
    ))
])

# Simplified parameter grid with better defaults for preventing overfitting
xgb_param_grid = {
    'xgb__n_estimators': [100, 200],
    'xgb__max_depth': [3, 4],
    'xgb__learning_rate': [0.01, 0.05],
    'xgb__min_child_weight': [5],
    'xgb__subsample': [0.7],
    'xgb__colsample_bytree': [0.7],
    'xgb__gamma': [0.1],
    'xgb__reg_alpha': [1],
    'xgb__reg_lambda': [2]
}

# Split data with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

# Set up cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform Grid Search
grid_search = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgb_param_grid,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2,
    return_train_score=True
)

# Fit the model
print("Starting model training...")
grid_search.fit(X_train, y_train)

# Print results
print("\nBest parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Get best model
best_model = grid_search.best_estimator_

# Generate learning curves
print("\nGenerating learning curves...")
plot_learning_curves(best_model, X, y, "XGBoost Learning Curves")

# Evaluate on test set
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot confusion matrix
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Plot ROC curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

# Feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': best_model.named_steps['xgb'].feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance.head(20))
plt.title('Feature Importance')
plt.tight_layout()
plt.show()
Starting model training...
Fitting 5 folds for each of 8 candidates, totalling 40 fits

Best parameters: {'xgb__colsample_bytree': 0.7, 'xgb__gamma': 0.1, 'xgb__learning_rate': 0.05, 'xgb__max_depth': 4, 'xgb__min_child_weight': 5, 'xgb__n_estimators': 200, 'xgb__reg_alpha': 1, 'xgb__reg_lambda': 2, 'xgb__subsample': 0.7}
Best cross-validation score: 0.9754360502580512

Generating learning curves...
No description has been provided for this image
Final Scores:
--------------------------------------------------
Training Accuracy:     0.9816 ± 0.0016
CV Accuracy:          0.9125 ± 0.0123
Training Loss:        0.1026 ± 0.0027
CV Loss:             0.1892 ± 0.0176
Overfitting Gap:      0.0691
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
File ~/anaconda3/envs/CAB420/lib/python3.12/site-packages/matplotlib/style/core.py:137, in use(style)
    136 try:
--> 137     style = _rc_params_in_file(style)
    138 except OSError as err:

File ~/anaconda3/envs/CAB420/lib/python3.12/site-packages/matplotlib/__init__.py:866, in _rc_params_in_file(fname, transform, fail_on_error)
    865 rc_temp = {}
--> 866 with _open_file_or_url(fname) as fd:
    867     try:

File ~/anaconda3/envs/CAB420/lib/python3.12/contextlib.py:137, in _GeneratorContextManager.__enter__(self)
    136 try:
--> 137     return next(self.gen)
    138 except StopIteration:

File ~/anaconda3/envs/CAB420/lib/python3.12/site-packages/matplotlib/__init__.py:843, in _open_file_or_url(fname)
    842 fname = os.path.expanduser(fname)
--> 843 with open(fname, encoding='utf-8') as f:
    844     yield f

FileNotFoundError: [Errno 2] No such file or directory: 'seaborn'

The above exception was the direct cause of the following exception:

OSError                                   Traceback (most recent call last)
Cell In[35], line 272
    270 # Generate learning curves
    271 print("\nGenerating learning curves...")
--> 272 plot_learning_curves(best_model, X, y, "XGBoost Learning Curves")
    274 # Evaluate on test set
    275 y_pred = best_model.predict(X_test)

Cell In[35], line 166, in plot_learning_curves(estimator, X, y, title)
    163 test_std_loss = np.std(test_scores_loss, axis=0)
    165 # Create subplots with improved styling
--> 166 plt.style.use('seaborn')
    167 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    168 fig.suptitle(title, fontsize=14, y=1.05)

File ~/anaconda3/envs/CAB420/lib/python3.12/site-packages/matplotlib/style/core.py:139, in use(style)
    137         style = _rc_params_in_file(style)
    138     except OSError as err:
--> 139         raise OSError(
    140             f"{style!r} is not a valid package style, path of style "
    141             f"file, URL of style file, or library style name (library "
    142             f"styles are listed in `style.available`)") from err
    143 filtered = {}
    144 for k in style:  # don't trigger RcParams.__getitem__('backend')

OSError: 'seaborn' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.3s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s

Overfitted XGBoost Model¶

In [37]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (roc_curve, auc, confusion_matrix, 
                           classification_report, accuracy_score)
from sklearn.model_selection import learning_curve
import seaborn as sns
from scipy import stats

def evaluate_model_comprehensive(model, X, y, X_test, y_test, title="Model Evaluation", figsize=(15, 10)):
    """
    Comprehensive model evaluation generating multiple metrics and visualizations.
    
    Parameters:
    -----------
    model : estimator object
        Trained model
    X : array-like
        Training features
    y : array-like
        Training labels
    X_test : array-like
        Test features
    y_test : array-like
        Test labels
    title : str
        Title for the plots
    figsize : tuple
        Figure size for the combined plots
    """
    # Make predictions
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    
    # Calculate confidence interval for accuracy
    accuracy = accuracy_score(y_test, y_pred)
    n = len(y_test)
    confidence_interval = 1.96 * np.sqrt((accuracy * (1 - accuracy)) / n)
    
    # Calculate standard error
    std_error = np.sqrt((accuracy * (1 - accuracy)) / n)
    
    # Get learning curves
    train_sizes, train_scores, test_scores = learning_curve(
        model, X, y, 
        cv=5, 
        n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10),
        scoring='accuracy'
    )
    
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    
    # Create figure with subplots
    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(2, 2)
    
    # 1. ROC Curve
    ax1 = fig.add_subplot(gs[0, 0])
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    
    ax1.plot(fpr, tpr, color='darkorange', lw=2, 
             label=f'ROC curve (AUC = {roc_auc:.2f})')
    ax1.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax1.set_xlim([0.0, 1.0])
    ax1.set_ylim([0.0, 1.05])
    ax1.set_xlabel('False Positive Rate')
    ax1.set_ylabel('True Positive Rate')
    ax1.set_title('ROC Curve')
    ax1.legend(loc="lower right")
    ax1.grid(True, alpha=0.3)
    
    # 2. Learning Curves
    ax2 = fig.add_subplot(gs[0, 1])
    ax2.plot(train_sizes, train_mean, 'o-', color='#2ecc71',
             label='Training accuracy')
    ax2.fill_between(train_sizes, train_mean - train_std,
                     train_mean + train_std, alpha=0.15, color='#2ecc71')
    ax2.plot(train_sizes, test_mean, 'o-', color='#e74c3c',
             label='Cross-validation accuracy')
    ax2.fill_between(train_sizes, test_mean - test_std,
                     test_mean + test_std, alpha=0.15, color='#e74c3c')
    ax2.set_xlabel('Training examples')
    ax2.set_ylabel('Accuracy')
    ax2.set_title('Learning Curves')
    ax2.legend(loc='lower right')
    ax2.grid(True, alpha=0.3)
    
    # 3. Confusion Matrix
    ax3 = fig.add_subplot(gs[1, 0])
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax3)
    ax3.set_title('Confusion Matrix')
    ax3.set_ylabel('True Label')
    ax3.set_xlabel('Predicted Label')
    
    # 4. Metrics Text
    ax4 = fig.add_subplot(gs[1, 1])
    ax4.axis('off')
    
    # Calculate metrics
    classification_rep = classification_report(y_test, y_pred, output_dict=True)
    
    # Prepare metrics text
    metrics_text = f"""
    Model Performance Metrics:
    
    Accuracy: {accuracy:.4f} ± {confidence_interval:.4f}
    95% CI: [{accuracy-confidence_interval:.4f}, {accuracy+confidence_interval:.4f}]
    Standard Error: {std_error:.4f}
    Standard Deviation: {np.std(test_scores):.4f}
    
    Detailed Metrics:
    Precision (Weighted): {classification_rep['weighted avg']['precision']:.4f}
    Recall (Weighted): {classification_rep['weighted avg']['recall']:.4f}
    F1-Score (Weighted): {classification_rep['weighted avg']['f1-score']:.4f}
    
    ROC AUC Score: {roc_auc:.4f}
    
    Training-Validation Gap: {train_mean[-1] - test_mean[-1]:.4f}
    """
    
    ax4.text(0, 1, metrics_text, fontsize=10, va='top', ha='left', 
             bbox=dict(facecolor='white', alpha=0.8))
    
    plt.suptitle(title, y=1.02, fontsize=16)
    plt.tight_layout()
    plt.show()
    
    # Print detailed metrics
    print("\nDetailed Performance Metrics:")
    print("-" * 50)
    print(f"Mean Accuracy ± CI: {accuracy:.4f} ± {confidence_interval:.4f}")
    print(f"Standard Deviation: {np.std(test_scores):.4f}")
    print(f"Standard Error: {std_error:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    return {
        'accuracy': accuracy,
        'confidence_interval': confidence_interval,
        'std_dev': np.std(test_scores),
        'std_error': std_error,
        'roc_auc': roc_auc,
        'confusion_matrix': cm,
        'classification_report': classification_report(y_test, y_pred, output_dict=True)
    }
In [38]:
# Your existing code...

# Get best model
best_model = grid_search.best_estimator_

# Add the comprehensive evaluation here
print("\nGenerating comprehensive evaluation...")
metrics = evaluate_model_comprehensive(
    model=best_model,
    X=X,
    y=y,
    X_test=X_test,
    y_test=y_test,
    title="XGBoost Model Evaluation"
)

# Print specific metrics if desired
print(f"Accuracy with CI: {metrics['accuracy']:.4f} ± {metrics['confidence_interval']:.4f}")

# Your existing evaluation code can continue here...
print("\nGenerating learning curves...")
plot_learning_curves(best_model, X, y, "XGBoost Learning Curves")

# Evaluate on test set
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]
Generating comprehensive evaluation...
No description has been provided for this image
Detailed Performance Metrics:
--------------------------------------------------
Mean Accuracy ± CI: 0.9141 ± 0.0268
Standard Deviation: 0.1015
Standard Error: 0.0137

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.93      0.92       236
           1       0.91      0.90      0.90       183

    accuracy                           0.91       419
   macro avg       0.91      0.91      0.91       419
weighted avg       0.91      0.91      0.91       419

Accuracy with CI: 0.9141 ± 0.0268

Generating learning curves...
No description has been provided for this image
Final Scores:
--------------------------------------------------
Training Accuracy:     0.9816 ± 0.0016
CV Accuracy:          0.9125 ± 0.0123
Training Loss:        0.1026 ± 0.0027
CV Loss:             0.1892 ± 0.0176
Overfitting Gap:      0.0691
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
File ~/anaconda3/envs/CAB420/lib/python3.12/site-packages/matplotlib/style/core.py:137, in use(style)
    136 try:
--> 137     style = _rc_params_in_file(style)
    138 except OSError as err:

File ~/anaconda3/envs/CAB420/lib/python3.12/site-packages/matplotlib/__init__.py:866, in _rc_params_in_file(fname, transform, fail_on_error)
    865 rc_temp = {}
--> 866 with _open_file_or_url(fname) as fd:
    867     try:

File ~/anaconda3/envs/CAB420/lib/python3.12/contextlib.py:137, in _GeneratorContextManager.__enter__(self)
    136 try:
--> 137     return next(self.gen)
    138 except StopIteration:

File ~/anaconda3/envs/CAB420/lib/python3.12/site-packages/matplotlib/__init__.py:843, in _open_file_or_url(fname)
    842 fname = os.path.expanduser(fname)
--> 843 with open(fname, encoding='utf-8') as f:
    844     yield f

FileNotFoundError: [Errno 2] No such file or directory: 'seaborn'

The above exception was the direct cause of the following exception:

OSError                                   Traceback (most recent call last)
Cell In[38], line 22
     20 # Your existing evaluation code can continue here...
     21 print("\nGenerating learning curves...")
---> 22 plot_learning_curves(best_model, X, y, "XGBoost Learning Curves")
     24 # Evaluate on test set
     25 y_pred = best_model.predict(X_test)

Cell In[35], line 166, in plot_learning_curves(estimator, X, y, title)
    163 test_std_loss = np.std(test_scores_loss, axis=0)
    165 # Create subplots with improved styling
--> 166 plt.style.use('seaborn')
    167 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    168 fig.suptitle(title, fontsize=14, y=1.05)

File ~/anaconda3/envs/CAB420/lib/python3.12/site-packages/matplotlib/style/core.py:139, in use(style)
    137         style = _rc_params_in_file(style)
    138     except OSError as err:
--> 139         raise OSError(
    140             f"{style!r} is not a valid package style, path of style "
    141             f"file, URL of style file, or library style name (library "
    142             f"styles are listed in `style.available`)") from err
    143 filtered = {}
    144 for k in style:  # don't trigger RcParams.__getitem__('backend')

OSError: 'seaborn' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)

Improving XGBoostModel¶

In [39]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (roc_curve, auc, confusion_matrix, 
                           classification_report, accuracy_score)
from sklearn.base import clone
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os

def plot_learning_curves(estimator, X, y, title, save_plots=True, output_dir='plots'):
    """Plot learning curves for model training and validation."""
    train_sizes = np.linspace(0.1, 1.0, 10)
    
    # Get accuracy scores
    train_sizes_acc, train_scores_acc, test_scores_acc = learning_curve(
        estimator, X, y,
        train_sizes=train_sizes,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        n_jobs=-1,
        scoring='accuracy'
    )
    
    # Calculate statistics
    train_mean_acc = np.mean(train_scores_acc, axis=1)
    train_std_acc = np.std(train_scores_acc, axis=1)
    test_mean_acc = np.mean(test_scores_acc, axis=1)
    test_std_acc = np.std(test_scores_acc, axis=1)
    
    # Create plot
    plt.figure(figsize=(15, 6))
    plt.subplot(1, 2, 1)
    plt.plot(train_sizes, train_mean_acc, 'o-', color='#2ecc71', label='Training accuracy')
    plt.fill_between(train_sizes, train_mean_acc - train_std_acc,
                    train_mean_acc + train_std_acc, alpha=0.15, color='#2ecc71')
    plt.plot(train_sizes, test_mean_acc, 'o-', color='#e74c3c', label='Cross-validation accuracy')
    plt.fill_between(train_sizes, test_mean_acc - test_std_acc,
                    test_mean_acc + test_std_acc, alpha=0.15, color='#e74c3c')
    plt.xlabel('Training examples')
    plt.ylabel('Accuracy')
    plt.title('Learning Curve - Accuracy')
    plt.legend(loc='lower right')
    plt.grid(True, alpha=0.3)
    
    if save_plots:
        plt.savefig(f'{output_dir}/learning_curves.png', dpi=300, bbox_inches='tight')
    
    plt.tight_layout()
    plt.show()
    
    # Print scores
    print("\nFinal Scores:")
    print("-" * 50)
    print(f"Training Accuracy:     {train_mean_acc[-1]:.4f} ± {train_std_acc[-1]:.4f}")
    print(f"CV Accuracy:          {test_mean_acc[-1]:.4f} ± {test_std_acc[-1]:.4f}")
    print(f"Overfitting Gap:      {train_mean_acc[-1] - test_mean_acc[-1]:.4f}")

def evaluate_model_comprehensive(model, X, y, X_test, y_test, title="Model Evaluation", 
                               figsize=(15, 10), save_plots=True, output_dir='plots'):
    """Comprehensive model evaluation with all metrics and visualizations."""
    
    if save_plots:
        os.makedirs(output_dir, exist_ok=True)
    
    # Make predictions
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    n = len(y_test)
    confidence_interval = 1.96 * np.sqrt((accuracy * (1 - accuracy)) / n)
    std_error = np.sqrt((accuracy * (1 - accuracy)) / n)
    
    # Create main figure
    fig = plt.figure(figsize=(18, 15))
    gs = fig.add_gridspec(3, 2)
    
    # 1. ROC Curve
    ax1 = fig.add_subplot(gs[0, 0])
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    
    ax1.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    ax1.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax1.set_xlim([0.0, 1.0])
    ax1.set_ylim([0.0, 1.05])
    ax1.set_xlabel('False Positive Rate')
    ax1.set_ylabel('True Positive Rate')
    ax1.set_title('ROC Curve')
    ax1.legend(loc="lower right")
    ax1.grid(True, alpha=0.3)
    
    # 2. Confusion Matrix
    ax2 = fig.add_subplot(gs[0, 1])
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax2)
    ax2.set_title('Confusion Matrix')
    ax2.set_ylabel('True Label')
    ax2.set_xlabel('Predicted Label')
    
    # 3. Feature Importance
    ax3 = fig.add_subplot(gs[1, :])
    
    if hasattr(model, 'named_steps') and 'xgb' in model.named_steps:
        importance = model.named_steps['xgb'].feature_importances_
    else:
        importance = model.feature_importances_
    
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': importance
    })
    feature_importance = feature_importance.sort_values('importance', ascending=False)
    
    sns.barplot(x='importance', y='feature', data=feature_importance.head(20), ax=ax3)
    ax3.set_title('Top 20 Feature Importance')
    ax3.set_xlabel('Importance')
    ax3.set_ylabel('Feature')
    
    # 4. Metrics Text
    ax4 = fig.add_subplot(gs[2, :])
    ax4.axis('off')
    
    classification_rep = classification_report(y_test, y_pred, output_dict=True)
    
    metrics_text = f"""
    Model Performance Metrics:
    
    Accuracy: {accuracy:.4f} ± {confidence_interval:.4f}
    95% CI: [{accuracy-confidence_interval:.4f}, {accuracy+confidence_interval:.4f}]
    Standard Error: {std_error:.4f}
    
    Detailed Metrics:
    Precision (Weighted): {classification_rep['weighted avg']['precision']:.4f}
    Recall (Weighted): {classification_rep['weighted avg']['recall']:.4f}
    F1-Score (Weighted): {classification_rep['weighted avg']['f1-score']:.4f}
    
    ROC AUC Score: {roc_auc:.4f}
    """
    
    ax4.text(0, 1, metrics_text, fontsize=12, va='top', ha='left',
             bbox=dict(facecolor='white', alpha=0.8))
    
    plt.suptitle(title, y=1.02, fontsize=16)
    plt.tight_layout()
    
    if save_plots:
        # Save individual plots
        plt.savefig(f'{output_dir}/complete_evaluation.png', dpi=300, bbox_inches='tight')
        feature_importance.to_csv(f'{output_dir}/feature_importance.csv', index=False)
        
        # Save metrics to text file
        with open(f'{output_dir}/model_metrics.txt', 'w') as f:
            f.write("Model Evaluation Results\n")
            f.write("=" * 50 + "\n\n")
            f.write("Performance Metrics:\n")
            f.write("-" * 30 + "\n")
            f.write(f"Accuracy: {accuracy:.4f} ± {confidence_interval:.4f}\n")
            f.write(f"95% CI: [{accuracy-confidence_interval:.4f}, {accuracy+confidence_interval:.4f}]\n")
            f.write(f"Standard Error: {std_error:.4f}\n")
            f.write(f"ROC AUC Score: {roc_auc:.4f}\n\n")
            f.write("Classification Report:\n")
            f.write("-" * 30 + "\n")
            f.write(classification_report(y_test, y_pred))
            f.write("\nTop 20 Most Important Features:\n")
            f.write("-" * 30 + "\n")
            for idx, row in feature_importance.head(20).iterrows():
                f.write(f"{row['feature']}: {row['importance']:.4f}\n")
    
    plt.show()
    
    return {
        'accuracy': accuracy,
        'confidence_interval': confidence_interval,
        'std_error': std_error,
        'roc_auc': roc_auc,
        'confusion_matrix': cm,
        'feature_importance': feature_importance,
        'classification_report': classification_report(y_test, y_pred, output_dict=True)
    }

# Create pipeline
xgb_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('xgb', xgb.XGBClassifier(
        random_state=42,
        objective='binary:logistic',
        scale_pos_weight=1,
        enable_categorical=False
    ))
])

# Parameter grid
xgb_param_grid = {
    'xgb__n_estimators': [100, 200],
    'xgb__max_depth': [3, 4],
    'xgb__learning_rate': [0.01, 0.05],
    'xgb__min_child_weight': [5],
    'xgb__subsample': [0.7],
    'xgb__colsample_bytree': [0.7],
    'xgb__gamma': [0.1],
    'xgb__reg_alpha': [1],
    'xgb__reg_lambda': [2]
}

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

# Set up cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform Grid Search
grid_search = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=xgb_param_grid,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2,
    return_train_score=True
)

# Fit the model
print("Starting model training...")
grid_search.fit(X_train, y_train)

# Get best model
best_model = grid_search.best_estimator_

# Print results
print("\nBest parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Generate comprehensive evaluation
print("\nGenerating comprehensive evaluation...")
metrics = evaluate_model_comprehensive(
    model=best_model,
    X=X,
    y=y,
    X_test=X_test,
    y_test=y_test,
    title="XGBoost Model Evaluation",
    save_plots=True,
    output_dir='model_evaluation'
)

# Generate learning curves
print("\nGenerating learning curves...")
plot_learning_curves(best_model, X, y, "XGBoost Learning Curves", save_plots=True, output_dir='model_evaluation')
Starting model training...
Fitting 5 folds for each of 8 candidates, totalling 40 fits

Best parameters: {'xgb__colsample_bytree': 0.7, 'xgb__gamma': 0.1, 'xgb__learning_rate': 0.05, 'xgb__max_depth': 4, 'xgb__min_child_weight': 5, 'xgb__n_estimators': 200, 'xgb__reg_alpha': 1, 'xgb__reg_lambda': 2, 'xgb__subsample': 0.7}
Best cross-validation score: 0.9754360502580512

Generating comprehensive evaluation...
No description has been provided for this image
Generating learning curves...
No description has been provided for this image
Final Scores:
--------------------------------------------------
Training Accuracy:     0.9816 ± 0.0016
CV Accuracy:          0.9125 ± 0.0123
Overfitting Gap:      0.0691

Final XGBoost Model¶

In [40]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (roc_curve, auc, confusion_matrix, 
                           classification_report, accuracy_score)
from sklearn.base import clone
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os

def plot_learning_curves(estimator, X, y, title, save_plots=True, output_dir='plots'):
    """Plot learning curves for model training and validation."""
    train_sizes = np.linspace(0.1, 1.0, 10)
    
    # Get accuracy scores
    train_sizes_acc, train_scores_acc, test_scores_acc = learning_curve(
        estimator, X, y,
        train_sizes=train_sizes,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        n_jobs=-1,
        scoring='accuracy'
    )
    
    # Calculate statistics
    train_mean_acc = np.mean(train_scores_acc, axis=1)
    train_std_acc = np.std(train_scores_acc, axis=1)
    test_mean_acc = np.mean(test_scores_acc, axis=1)
    test_std_acc = np.std(test_scores_acc, axis=1)
    
    # Create plot
    plt.figure(figsize=(15, 6))
    plt.subplot(1, 2, 1)
    plt.plot(train_sizes, train_mean_acc, 'o-', color='#2ecc71', label='Training accuracy')
    plt.fill_between(train_sizes, train_mean_acc - train_std_acc,
                    train_mean_acc + train_std_acc, alpha=0.15, color='#2ecc71')
    plt.plot(train_sizes, test_mean_acc, 'o-', color='#e74c3c', label='Cross-validation accuracy')
    plt.fill_between(train_sizes, test_mean_acc - test_std_acc,
                    test_mean_acc + test_std_acc, alpha=0.15, color='#e74c3c')
    plt.xlabel('Training examples')
    plt.ylabel('Accuracy')
    plt.title('Learning Curve - Accuracy')
    plt.legend(loc='lower right')
    plt.grid(True, alpha=0.3)
    
    if save_plots:
        plt.savefig(f'{output_dir}/learning_curves.png', dpi=300, bbox_inches='tight')
    
    plt.tight_layout()
    plt.show()
    
    # Print scores
    print("\nFinal Scores:")
    print("-" * 50)
    print(f"Training Accuracy:     {train_mean_acc[-1]:.4f} ± {train_std_acc[-1]:.4f}")
    print(f"CV Accuracy:          {test_mean_acc[-1]:.4f} ± {test_std_acc[-1]:.4f}")
    print(f"Overfitting Gap:      {train_mean_acc[-1] - test_mean_acc[-1]:.4f}")

def evaluate_model_comprehensive(model, X, y, X_test, y_test, title="Model Evaluation", 
                               figsize=(15, 10), save_plots=True, output_dir='plots'):
    """Comprehensive model evaluation with all metrics and visualizations."""
    if save_plots:
        os.makedirs(output_dir, exist_ok=True)
    
    # Make predictions
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    n = len(y_test)
    confidence_interval = 1.96 * np.sqrt((accuracy * (1 - accuracy)) / n)
    std_error = np.sqrt((accuracy * (1 - accuracy)) / n)
    
    # Create main figure
    fig = plt.figure(figsize=(18, 15))
    gs = fig.add_gridspec(3, 2)
    
    # 1. ROC Curve
    ax1 = fig.add_subplot(gs[0, 0])
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    
    ax1.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    ax1.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax1.set_xlim([0.0, 1.0])
    ax1.set_ylim([0.0, 1.05])
    ax1.set_xlabel('False Positive Rate')
    ax1.set_ylabel('True Positive Rate')
    ax1.set_title('ROC Curve')
    ax1.legend(loc="lower right")
    ax1.grid(True, alpha=0.3)
    
    # 2. Confusion Matrix
    ax2 = fig.add_subplot(gs[0, 1])
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax2)
    ax2.set_title('Confusion Matrix')
    ax2.set_ylabel('True Label')
    ax2.set_xlabel('Predicted Label')
    
    # 3. Feature Importance
    ax3 = fig.add_subplot(gs[1, :])
    if hasattr(model, 'named_steps') and 'xgb' in model.named_steps:
        importance = model.named_steps['xgb'].feature_importances_
    else:
        importance = model.feature_importances_
    
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': importance
    })
    feature_importance = feature_importance.sort_values('importance', ascending=False)
    
    sns.barplot(x='importance', y='feature', data=feature_importance.head(20), ax=ax3)
    ax3.set_title('Top 20 Feature Importance')
    ax3.set_xlabel('Importance')
    ax3.set_ylabel('Feature')
    
    # 4. Metrics Text
    ax4 = fig.add_subplot(gs[2, :])
    ax4.axis('off')
    
    classification_rep = classification_report(y_test, y_pred, output_dict=True)
    
    metrics_text = f"""
    Model Performance Metrics:
    
    Accuracy: {accuracy:.4f} ± {confidence_interval:.4f}
    95% CI: [{accuracy-confidence_interval:.4f}, {accuracy+confidence_interval:.4f}]
    Standard Error: {std_error:.4f}
    
    Detailed Metrics:
    Precision (Weighted): {classification_rep['weighted avg']['precision']:.4f}
    Recall (Weighted): {classification_rep['weighted avg']['recall']:.4f}
    F1-Score (Weighted): {classification_rep['weighted avg']['f1-score']:.4f}
    
    ROC AUC Score: {roc_auc:.4f}
    """
    
    ax4.text(0, 1, metrics_text, fontsize=12, va='top', ha='left',
             bbox=dict(facecolor='white', alpha=0.8))
    
    plt.suptitle(title, y=1.02, fontsize=16)
    plt.tight_layout()
    
    if save_plots:
        plt.savefig(f'{output_dir}/complete_evaluation.png', dpi=300, bbox_inches='tight')
        feature_importance.to_csv(f'{output_dir}/feature_importance.csv', index=False)
        
        with open(f'{output_dir}/model_metrics.txt', 'w') as f:
            f.write("Model Evaluation Results\n")
            f.write("=" * 50 + "\n\n")
            f.write("Performance Metrics:\n")
            f.write("-" * 30 + "\n")
            f.write(f"Accuracy: {accuracy:.4f} ± {confidence_interval:.4f}\n")
            f.write(f"95% CI: [{accuracy-confidence_interval:.4f}, {accuracy+confidence_interval:.4f}]\n")
            f.write(f"Standard Error: {std_error:.4f}\n")
            f.write(f"ROC AUC Score: {roc_auc:.4f}\n\n")
            f.write("Classification Report:\n")
            f.write("-" * 30 + "\n")
            f.write(classification_report(y_test, y_pred))
            f.write("\nTop 20 Most Important Features:\n")
            f.write("-" * 30 + "\n")
            for idx, row in feature_importance.head(20).iterrows():
                f.write(f"{row['feature']}: {row['importance']:.4f}\n")
    
    plt.show()
    
    return {
        'accuracy': accuracy,
        'confidence_interval': confidence_interval,
        'std_error': std_error,
        'roc_auc': roc_auc,
        'confusion_matrix': cm,
        'feature_importance': feature_importance,
        'classification_report': classification_report(y_test, y_pred, output_dict=True)
    }

def train_and_evaluate_model(X, y, pipeline, param_grid, model_name="Model"):
    """Train and evaluate a model with given parameters."""
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size=0.2, 
        random_state=42, 
        stratify=y
    )
    
    # Set up cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # Perform Grid Search
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=2,
        return_train_score=True
    )
    
    # Fit the model
    print(f"\nTraining {model_name}...")
    grid_search.fit(X_train, y_train)
    
    # Get best model
    best_model = grid_search.best_estimator_
    
    # Print results
    print(f"\nBest parameters for {model_name}:", grid_search.best_params_)
    print(f"Best cross-validation score:", grid_search.best_score_)
    
    # Generate comprehensive evaluation
    print(f"\nGenerating comprehensive evaluation for {model_name}...")
    metrics = evaluate_model_comprehensive(
        model=best_model,
        X=X,
        y=y,
        X_test=X_test,
        y_test=y_test,
        title=f"{model_name} Evaluation",
        save_plots=True,
        output_dir=f'{model_name.lower().replace(" ", "_")}_evaluation'
    )
    
    # Generate learning curves
    print(f"\nGenerating learning curves for {model_name}...")
    plot_learning_curves(
        best_model, 
        X, y, 
        f"{model_name} Learning Curves", 
        save_plots=True, 
        output_dir=f'{model_name.lower().replace(" ", "_")}_evaluation'
    )
    
    return best_model, metrics

# Define original model pipeline and parameters
original_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('xgb', xgb.XGBClassifier(
        random_state=42,
        objective='binary:logistic',
        scale_pos_weight=1,
        enable_categorical=False
    ))
])

original_param_grid = {
    'xgb__n_estimators': [100, 200],
    'xgb__max_depth': [3, 4],
    'xgb__learning_rate': [0.01, 0.05],
    'xgb__min_child_weight': [5],
    'xgb__subsample': [0.7],
    'xgb__colsample_bytree': [0.7],
    'xgb__gamma': [0.1],
    'xgb__reg_alpha': [1],
    'xgb__reg_lambda': [2]
}

# Define conservative model pipeline and parameters
conservative_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('xgb', xgb.XGBClassifier(
        random_state=42,
        objective='binary:logistic',
        scale_pos_weight=1,
        enable_categorical=False
    ))
])

conservative_param_grid = {
    'xgb__n_estimators': [50, 100],
    'xgb__max_depth': [2, 3],
    'xgb__learning_rate': [0.01],
    'xgb__min_child_weight': [7, 10],
    'xgb__subsample': [0.6],
    'xgb__colsample_bytree': [0.6],
    'xgb__gamma': [0.2, 0.3],
    'xgb__reg_alpha': [2, 5],
    'xgb__reg_lambda': [5, 10]
}

# Train and evaluate both models
original_model, original_metrics = train_and_evaluate_model(
    X, y, 
    original_pipeline, 
    original_param_grid, 
    "Original XGBoost"
)

conservative_model, conservative_metrics = train_and_evaluate_model(
    X, y, 
    conservative_pipeline, 
    conservative_param_grid, 
    "Conservative XGBoost"
)

# Compare models
print("\nModel Comparison:")
print("-" * 50)
print(f"Original Model Accuracy: {original_metrics['accuracy']:.4f} ± {original_metrics['confidence_interval']:.4f}")
print(f"Conservative Model Accuracy: {conservative_metrics['accuracy']:.4f} ± {conservative_metrics['confidence_interval']:.4f}")
print(f"\nOriginal Model ROC AUC: {original_metrics['roc_auc']:.4f}")
print(f"Conservative Model ROC AUC: {conservative_metrics['roc_auc']:.4f}")
Training Original XGBoost...
Fitting 5 folds for each of 8 candidates, totalling 40 fits

Best parameters for Original XGBoost: {'xgb__colsample_bytree': 0.7, 'xgb__gamma': 0.1, 'xgb__learning_rate': 0.05, 'xgb__max_depth': 4, 'xgb__min_child_weight': 5, 'xgb__n_estimators': 200, 'xgb__reg_alpha': 1, 'xgb__reg_lambda': 2, 'xgb__subsample': 0.7}
Best cross-validation score: 0.9754360502580512

Generating comprehensive evaluation for Original XGBoost...
No description has been provided for this image
Generating learning curves for Original XGBoost...
No description has been provided for this image
Final Scores:
--------------------------------------------------
Training Accuracy:     0.9816 ± 0.0016
CV Accuracy:          0.9125 ± 0.0123
Overfitting Gap:      0.0691

Training Conservative XGBoost...
Fitting 5 folds for each of 64 candidates, totalling 320 fits

Best parameters for Conservative XGBoost: {'xgb__colsample_bytree': 0.6, 'xgb__gamma': 0.2, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 3, 'xgb__min_child_weight': 7, 'xgb__n_estimators': 100, 'xgb__reg_alpha': 2, 'xgb__reg_lambda': 5, 'xgb__subsample': 0.6}
Best cross-validation score: 0.9542537825389555

Generating comprehensive evaluation for Conservative XGBoost...
No description has been provided for this image
Generating learning curves for Conservative XGBoost...
No description has been provided for this image
Final Scores:
--------------------------------------------------
Training Accuracy:     0.8877 ± 0.0032
CV Accuracy:          0.8685 ± 0.0081
Overfitting Gap:      0.0192

Model Comparison:
--------------------------------------------------
Original Model Accuracy: 0.9141 ± 0.0268
Conservative Model Accuracy: 0.8473 ± 0.0344

Original Model ROC AUC: 0.9800
Conservative Model ROC AUC: 0.9452
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.2s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.01, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=3, xgb__min_child_weight=5, xgb__n_estimators=200, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.7, xgb__gamma=0.1, xgb__learning_rate=0.05, xgb__max_depth=4, xgb__min_child_weight=5, xgb__n_estimators=100, xgb__reg_alpha=1, xgb__reg_lambda=2, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.2, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=7, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=2, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=7, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=2, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=50, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.0s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=5, xgb__subsample=0.6; total time=   0.1s
[CV] END xgb__colsample_bytree=0.6, xgb__gamma=0.3, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__min_child_weight=10, xgb__n_estimators=100, xgb__reg_alpha=5, xgb__reg_lambda=10, xgb__subsample=0.6; total time=   0.0s